Robotics
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Robotics</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item22">Cross-lists</a></li> <li><a href="#item38">Replacements</a></li> </ul> <p>See <a id="recent-cs.RO" aria-labelledby="recent-cs.RO" href="/list/cs.RO/recent">recent</a> articles</p> <h3>Showing new listings for Friday, 21 March 2025</h3> <div class='paging'>Total of 58 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.RO/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 21 of 21 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.15629" title="Abstract" id="2503.15629"> arXiv:2503.15629 </a> [<a href="/pdf/2503.15629" title="Download PDF" id="pdf-2503.15629" aria-labelledby="pdf-2503.15629">pdf</a>, <a href="https://arxiv.org/html/2503.15629v1" title="View HTML" id="html-2503.15629" aria-labelledby="html-2503.15629" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15629" title="Other formats" id="oth-2503.15629" aria-labelledby="oth-2503.15629">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neural Lyapunov Function Approximation with Self-Supervised Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=McCutcheon,+L">Luc McCutcheon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gharesifard,+B">Bahman Gharesifard</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fallah,+S">Saber Fallah</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at IEEE International Conference on Robotics and Automation (ICRA) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computational Geometry (cs.CG); Machine Learning (cs.LG) </div> <p class='mathjax'> Control Lyapunov functions are traditionally used to design a controller which ensures convergence to a desired state, yet deriving these functions for nonlinear systems remains a complex challenge. 
[2] arXiv:2503.15685 [pdf, html, other]
Title: Robotic Paper Wrapping by Learning Force Control
Authors: Hiroki Hanai, Takuya Kiyokawa, Weiwei Wan, Kensuke Harada
Subjects: Robotics (cs.RO); Machine Learning (cs.LG)

Robotic packaging using wrapping paper poses significant challenges due to the material's complex deformation properties. The packaging process itself involves multiple steps, primarily categorized as folding the paper or creating creases. Small deviations in the robot's arm trajectory or force vector can lead to tearing or wrinkling of the paper, exacerbated by the variability in material properties.

This study introduces a novel framework that combines imitation learning and reinforcement learning to enable a robot to perform each step of the packaging process efficiently. The framework allows the robot to follow approximate trajectories of the tool-center point (TCP) based on human demonstrations while optimizing force control parameters to prevent tearing or wrinkling, even with variable wrapping paper materials.

The proposed method was validated through ablation studies, which demonstrated successful task completion with a significant reduction in tear and wrinkle rates. Furthermore, the force control strategy proved to be adaptable across different wrapping paper materials and robust against variations in the size of the target object.
[3] arXiv:2503.15688 [pdf, html, other]
Title: Capturing a Moving Target by Two Robots in the F2F Model
Authors: Khaled Jawhar, Evangelos Kranakis
Subjects: Robotics (cs.RO); Distributed, Parallel, and Cluster Computing (cs.DC)

We study a search problem on capturing a moving target on an infinite real line. Two autonomous mobile robots (which can move with a maximum speed of 1) are initially placed at the origin, while an oblivious moving target is initially placed at a distance $d$ away from the origin. The robots can move along the line in any direction, but the target is oblivious, cannot change direction, and moves either away from or toward the origin at a constant speed $v$. Our aim is to design efficient algorithms for the two robots to capture the target. The target is captured only when both robots are co-located with it. The robots communicate with each other only face-to-face (F2F), meaning they can exchange information only when co-located, while the target remains oblivious and has no communication capabilities.

We design algorithms under various knowledge scenarios, which take into account the prior knowledge the robots have about the starting distance $d$, the direction of movement (either toward or away from the origin), and the speed $v$ of the target. As a measure of the efficiency of the algorithms, we use the competitive ratio, which is the ratio of the capture time of an algorithm with limited knowledge to the capture time in the full-knowledge model. In our analysis, we are mindful of the cost of changing direction of movement, and show how to accomplish the capture of the target with at most three direction changes (turns).
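As a point of reference (standard pursuit arithmetic on a line, not a result quoted from the abstract): in the full-knowledge model the robots know $d$, $v$, and the direction, so they simply travel together toward the target at speed 1, capturing it at time $d/(1-v)$ if the target recedes (requiring $v<1$) or $d/(1+v)$ if it approaches; the competitive ratio of a limited-knowledge algorithm with capture time $T$ is then $T$ divided by this full-knowledge capture time.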
[4] arXiv:2503.15707 [pdf, html, other]
Title: Safety Aware Task Planning via Large Language Models in Robotics
Authors: Azal Ahmad Khan, Michael Andrev, Muhammad Ali Murtaza, Sergio Aguilera, Rui Zhang, Jie Ding, Seth Hutchinson, Ali Anwar
Subjects: Robotics (cs.RO); Artificial Intelligence (cs.AI)

The integration of large language models (LLMs) into robotic task planning has unlocked better reasoning capabilities for complex, long-horizon workflows. However, ensuring safety in LLM-driven plans remains a critical challenge, as these models often prioritize task completion over risk mitigation. This paper introduces SAFER (Safety-Aware Framework for Execution in Robotics), a multi-LLM framework designed to embed safety awareness into robotic task planning. SAFER employs a Safety Agent that operates alongside the primary task planner, providing safety feedback. Additionally, we introduce LLM-as-a-Judge, a novel metric leveraging LLMs as evaluators to quantify safety violations within generated task plans. Our framework integrates safety feedback at multiple stages of execution, enabling real-time risk assessment, proactive error correction, and transparent safety evaluation. We also integrate a control framework using Control Barrier Functions (CBFs) to ensure safety guarantees within SAFER's task planning. We evaluated SAFER against state-of-the-art LLM planners on complex long-horizon tasks involving heterogeneous robotic agents, demonstrating its effectiveness in reducing safety violations while maintaining task efficiency. We also verify the task planner and safety planner through actual hardware experiments involving multiple robots and a human.
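For readers unfamiliar with the Control Barrier Function layer mentioned above: a CBF filter minimally modifies a nominal command so that a barrier condition h(x) >= 0 is maintained. The sketch below is the closed-form solution of the standard single-constraint CBF quadratic program for single-integrator dynamics; it illustrates the general technique only, not SAFER's control stack, and all names are placeholders.

import numpy as np

def cbf_filter(x, u_nominal, h, grad_h, alpha=1.0):
    """Minimally modify u_nominal so that dh/dt >= -alpha * h(x) holds.

    Single-integrator dynamics x_dot = u with one barrier h(x) >= 0.
    Closed-form solution of the QP:  min ||u - u_nominal||^2
    subject to grad_h(x) . u >= -alpha * h(x).
    """
    g = grad_h(x)                       # barrier gradient at the current state
    slack = g @ u_nominal + alpha * h(x)
    if slack >= 0.0:                    # nominal command already satisfies the constraint
        return u_nominal
    return u_nominal - (slack / (g @ g)) * g   # project onto the constraint boundary

# Example: keep a planar robot outside the unit disc centred at the origin.
h = lambda x: x @ x - 1.0               # h(x) >= 0  <=>  outside the disc
grad_h = lambda x: 2.0 * x
print(cbf_filter(np.array([1.5, 0.0]), np.array([-1.0, 0.0]), h, grad_h))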
[5] arXiv:2503.15715 [pdf, other]
Title: Experience-based Optimal Motion Planning Algorithm for Solving Difficult Planning Problems Using a Limited Dataset
Authors: Ryota Takamido, Jun Ota
Subjects: Robotics (cs.RO)

This study aims to address the key challenge of obtaining a high-quality solution path within a short calculation time by generalizing a limited dataset. In the informed experience-driven random trees connect star (IERTC*) process, the algorithm flexibly explores the search trees by morphing the micro paths generated from a single experience while reducing the path cost by introducing a re-wiring process and an informed sampling process. The core idea of this algorithm is to apply different strategies depending on the complexity of the local environment; for example, it adopts a more complex curved trajectory if obstacles are densely arranged near the search tree, and it adopts a simpler straight line if the local environment is sparse. The results of experiments using a general motion benchmark test revealed that IERTC* significantly improved the planning success rate in difficult problems in the cluttered environment (an average improvement of 49.3% compared to the state-of-the-art algorithm) while also significantly reducing the solution cost (a reduction of 56.3%) when using one hundred experiences. Furthermore, the results demonstrated outstanding planning performance even when only one experience was available (a 43.8% improvement in success rate and a 57.8% reduction in solution cost).
[6] arXiv:2503.15724 [pdf, html, other]
Title: Reward Training Wheels: Adaptive Auxiliary Rewards for Robotics Reinforcement Learning
Authors: Linji Wang, Tong Xu, Yuanjie Lu, Xuesu Xiao
Comments: 7 pages, 5 figures
Subjects: Robotics (cs.RO); Artificial Intelligence (cs.AI)

Robotics Reinforcement Learning (RL) often relies on carefully engineered auxiliary rewards to supplement sparse primary learning objectives to compensate for the lack of large-scale, real-world, trial-and-error data. While these auxiliary rewards accelerate learning, they require significant engineering effort, may introduce human biases, and cannot adapt to the robot's evolving capabilities during training. In this paper, we introduce Reward Training Wheels (RTW), a teacher-student framework that automates auxiliary reward adaptation for robotics RL. To be specific, the RTW teacher dynamically adjusts auxiliary reward weights based on the student's evolving capabilities to determine which auxiliary reward aspects require more or less emphasis to improve the primary objective. We demonstrate RTW on two challenging robot tasks: navigation in highly constrained spaces and off-road vehicle mobility on vertically challenging terrain. In simulation, RTW outperforms expert-designed rewards by 2.35% in navigation success rate and improves off-road mobility performance by 122.62%, while achieving 35% and 3X faster training efficiency, respectively. Physical robot experiments further validate RTW's effectiveness, achieving a perfect success rate (5/5 trials vs. 2/5 for expert-designed rewards) and improving vehicle stability with up to 47.4% reduction in orientation angles.
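Conceptually, the setup above amounts to a total reward of the form $r_t = r^{primary}_t + \sum_i w_i\, r^{aux}_{i,t}$ with the weights $w_i$ adapted as training progresses. The sketch below only illustrates that weighted combination with a simple, purely hypothetical adaptation rule; it is not the RTW teacher's actual update.

import numpy as np

class AdaptiveAuxiliaryReward:
    """Weighted combination of a sparse primary reward and auxiliary shaping terms."""

    def __init__(self, n_aux, lr=0.05):
        self.w = np.ones(n_aux) / n_aux   # auxiliary reward weights
        self.lr = lr

    def total_reward(self, r_primary, r_aux):
        return float(r_primary + self.w @ np.asarray(r_aux))

    def adapt(self, primary_improvement, aux_episode_means):
        # Hypothetical rule: shift weight toward auxiliary terms that were active
        # while the primary objective improved, then renormalise.
        self.w += self.lr * primary_improvement * np.asarray(aux_episode_means)
        self.w = np.clip(self.w, 0.0, None)
        self.w /= max(self.w.sum(), 1e-8)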
[7] arXiv:2503.15781 [pdf, html, other]
Title: UAS Visual Navigation in Large and Unseen Environments via a Meta Agent
Authors: Yuci Han, Charles Toth, Alper Yilmaz
Subjects: Robotics (cs.RO); Computer Vision and Pattern Recognition (cs.CV)

The aim of this work is to develop an approach that enables Unmanned Aerial System (UAS) to efficiently learn to navigate in large-scale urban environments and transfer their acquired expertise to novel environments. To achieve this, we propose a meta-curriculum training scheme. First, meta-training allows the agent to learn a master policy to generalize across tasks. The resulting model is then fine-tuned on the downstream tasks. We organize the training curriculum in a hierarchical manner such that the agent is guided from coarse to fine towards the target task. In addition, we introduce Incremental Self-Adaptive Reinforcement learning (ISAR), an algorithm that combines the ideas of incremental learning and meta-reinforcement learning (MRL). In contrast to traditional reinforcement learning (RL), which focuses on acquiring a policy for a specific task, MRL aims to learn a policy with fast transfer ability to novel tasks. However, the MRL training process is time consuming, whereas our proposed ISAR algorithm achieves faster convergence than the conventional MRL algorithm. We evaluate the proposed methodologies in simulated environments and demonstrate that using this training philosophy in conjunction with the ISAR algorithm significantly improves the convergence speed for navigation in large-scale cities and the adaptation proficiency in novel environments.
[8] arXiv:2503.15819 [pdf, html, other]
Title: Control Pneumatic Soft Bending Actuator with Online Learning Pneumatic Physical Reservoir Computing
Authors: Junyi Shen, Tetsuro Miyazaki, Kenji Kawashima
Comments: 8 pages, 13 figures, IEEE-RAS International Conference on Soft Robotics (RoboSoft 2025)
Subjects: Robotics (cs.RO); Machine Learning (cs.LG); Systems and Control (eess.SY)

The intrinsic nonlinearities of soft robots present significant control challenges but simultaneously provide them with rich computational potential. Reservoir computing (RC) has shown effectiveness in online learning systems for controlling nonlinear systems such as soft actuators. Conventional RC can be extended into physical reservoir computing (PRC) by leveraging the nonlinear dynamics of soft actuators for computation. This paper introduces a PRC-based online learning framework to control the motion of a pneumatic soft bending actuator, utilizing another pneumatic soft actuator as the PRC model. Unlike conventional designs requiring two RC models, the proposed control system employs a more compact architecture with a single RC model. Additionally, the framework enables zero-shot online learning, addressing limitations of previous PRC-based control systems reliant on offline training. Simulations and experiments validated the performance of the proposed system. Experimental results indicate that the PRC model achieved superior control performance compared to a linear model, reducing the root-mean-square error (RMSE) by an average of over 37% in bending motion control tasks. The proposed PRC-based online learning control framework provides a novel approach for harnessing physical systems' inherent nonlinearities to enhance the control of soft actuators.
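As background on the reservoir computing structure referred to above: a fixed nonlinear dynamical system (here a random echo state network; in the paper, a physical pneumatic actuator) is driven by the input, and only a linear readout is trained online, e.g. with recursive least squares. The following is a minimal generic sketch of that idea, not the paper's pneumatic PRC controller.

import numpy as np

rng = np.random.default_rng(0)
N = 100                                    # reservoir size

# Fixed random reservoir (the paper replaces this with a physical pneumatic actuator).
W_in = rng.uniform(-0.5, 0.5, (N, 1))
W = rng.uniform(-0.5, 0.5, (N, N))
W *= 0.9 / np.max(np.abs(np.linalg.eigvals(W)))   # keep spectral radius below 1

W_out = np.zeros((1, N))                   # linear readout, trained online
P = np.eye(N) * 100.0                      # RLS inverse correlation matrix
x = np.zeros(N)

def step(u, target=None):
    """Advance the reservoir; if a target is given, adapt the readout with RLS."""
    global x, W_out, P
    x = np.tanh(W @ x + W_in @ np.atleast_1d(u))
    y = W_out @ x
    if target is not None:
        err = np.atleast_1d(target) - y
        k = P @ x / (1.0 + x @ P @ x)      # RLS gain
        W_out += np.outer(err, k)
        P -= np.outer(k, x @ P)
    return y

# Example: learn online to reproduce a delayed sine wave.
for t in range(2000):
    step(np.sin(0.05 * t), target=np.sin(0.05 * (t - 5)))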
[9] arXiv:2503.15836 [pdf, html, other]
Title: APEX-MR: Multi-Robot Asynchronous Planning and Execution for Cooperative Assembly
Authors: Philip Huang, Ruixuan Liu, Changliu Liu, Jiaoyang Li
Comments: 17 pages, 11 figures
Subjects: Robotics (cs.RO)

Compared to a single-robot workstation, a multi-robot system offers several advantages: 1) it expands the system's workspace, 2) improves task efficiency, and more importantly, 3) enables robots to achieve significantly more complex and dexterous tasks, such as cooperative assembly. However, coordinating the tasks and motions of multiple robots is challenging due to issues such as system uncertainty, task efficiency, algorithm scalability, and safety concerns. To address these challenges, this paper studies multi-robot coordination and proposes APEX-MR, an asynchronous planning and execution framework designed to safely and efficiently coordinate multiple robots to achieve cooperative assembly, e.g. LEGO assembly. In particular, APEX-MR provides a systematic approach to post-process multi-robot tasks and motion plans to enable robust asynchronous execution under uncertainty. Experimental results demonstrate that APEX-MR can significantly speed up the execution time of many long-horizon LEGO assembly tasks by 48% compared to sequential planning and 36% compared to synchronous planning on average. To further demonstrate the performance, we deploy APEX-MR to a dual-arm system to perform physical LEGO assembly. To our knowledge, this is the first robotic system capable of performing customized LEGO assembly using commercial LEGO bricks. The experiment results demonstrate that the dual-arm system, with APEX-MR, can safely coordinate robot motions, efficiently collaborate, and construct complex LEGO structures.
Our project website is available at https://intelligent-control-lab.github.io/APEX-MR/

[10] arXiv:2503.15895 [pdf, other]
Title: CONTHER: Human-Like Contextual Robot Learning via Hindsight Experience Replay and Transformers without Expert Demonstrations
Authors: Maria Makarova, Qian Liu, Dzmitry Tsetserukou
Comments: Submitted to IROS 2025
Subjects: Robotics (cs.RO)

This paper presents CONTHER, a novel reinforcement learning algorithm designed to efficiently and rapidly train robotic agents for goal-oriented manipulation tasks and obstacle avoidance. The algorithm uses a modified replay buffer inspired by the Hindsight Experience Replay (HER) approach to artificially populate experience with successful trajectories, effectively addressing the problem of sparse reward scenarios and eliminating the need to manually collect expert demonstrations.

The developed algorithm proposes a Transformer-based architecture to incorporate the context of previous states, allowing the agent to perform a deeper analysis and make decisions in a manner more akin to human learning. The effectiveness of the built-in replay buffer, which acts as an "internal demonstrator", is twofold: it accelerates learning and allows the algorithm to adapt to different tasks. Empirical data confirm the superiority of the algorithm by an average of 38.46% over other considered methods, and over the most successful baseline by 28.21%, showing higher success rates and faster convergence in the point-reaching task. Since the control is performed through the robot's joints, the algorithm facilitates potential adaptation to a real robot system and construction of an obstacle avoidance task. Therefore, the algorithm has also been tested on tasks requiring following a complex dynamic trajectory and obstacle avoidance. The design of the algorithm ensures its applicability to a wide range of goal-oriented tasks, making it an easily integrated solution for real-world robotics applications.
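Hindsight Experience Replay, which the buffer above modifies, turns failed goal-conditioned episodes into useful data by relabelling transitions with goals that were actually reached later in the episode. Below is a minimal sketch of the standard "future" relabelling strategy with assumed transition fields; it is illustrative only and is not the CONTHER buffer itself.

import random

def her_relabel(episode, reward_fn, k=4):
    """Generate extra transitions whose goals are outcomes reached later in the episode.

    episode: list of dicts with keys 'obs', 'action', 'next_obs',
             'achieved_goal' (goal actually reached after the transition) and 'goal'.
    reward_fn(achieved_goal, goal) -> reward under the substituted goal.
    """
    relabelled = []
    for t, tr in enumerate(episode):
        future = episode[t:]                              # HER "future" strategy
        for _ in range(k):
            new_goal = random.choice(future)["achieved_goal"]
            relabelled.append({
                "obs": tr["obs"],
                "action": tr["action"],
                "next_obs": tr["next_obs"],
                "goal": new_goal,
                "reward": reward_fn(tr["achieved_goal"], new_goal),
            })
    return relabelled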
[11] arXiv:2503.15915 [pdf, html, other]
Title: Development of a Magnetorheological Hand Exoskeleton Featuring High Force-to-power Ratio for Enhancing Grip Endurance
Authors: Wenbo Li, Xianlong Mai, Ying Li
Subjects: Robotics (cs.RO); Systems and Control (eess.SY)

Hand exoskeletons have significant potential in labor-intensive fields by mitigating hand grip fatigue, enhancing hand strength, and preventing injuries. However, most traditional hand exoskeletons are driven by motors whose output force is limited under constrained installation conditions. In addition, they also come with the disadvantages of high power consumption, complex and bulky assistive systems, and high instability. In this work, we develop a novel hand exoskeleton integrated with magnetorheological (MR) clutches that offers a high force-to-power ratio to improve grip endurance. The clutch features an enhanced structure design, a micro roller enhancing structure, which can significantly boost output forces. The experimental data demonstrate that the clutch can deliver a peak holding force of 380 N with a consumption of 1.48 W, yielding a force-to-power ratio of 256.75 N/W, which is 2.35 times higher than the best reported actuator used for hand exoskeletons. The designed MR hand exoskeleton is highly integrated and comprises an exoskeleton frame, MR clutches, a control unit, and a battery. Evaluations through static grip endurance tests and dynamic carrying and lifting tests confirm that the MR hand exoskeleton can effectively reduce muscle fatigue, extend grip endurance, and minimize injuries. These findings highlight its strong potential for practical applications in repetitive tasks such as carrying and lifting in industrial settings.
[12] arXiv:2503.15987 [pdf, html, other]
Title: A Laser-guided Interaction Interface for Providing Effective Robot Assistance to People with Upper Limbs Impairments
Authors: Davide Torielli, Liana Bertoni, Luca Muratore, Nikos Tsagarakis
Comments: 8 pages, 12 figures
Journal-ref: IEEE Robotics and Automation Letters, vol. 9, no. 9, pp. 7653-7660, Sept. 2024
Subjects: Robotics (cs.RO)

Robotics has shown significant potential in assisting people with disabilities to enhance their independence and involvement in daily activities. Indeed, a societal long-term impact is expected in home-care assistance with the deployment of intelligent robotic interfaces. This work presents a human-robot interface developed to help people with upper limbs impairments, such as those affected by stroke injuries, in activities of everyday life. The proposed interface leverages a visual servoing guidance component, which utilizes an inexpensive but effective laser emitter device. By projecting the laser on a surface within the workspace of the robot, the user is able to guide the robotic manipulator to desired locations, to reach, grasp and manipulate objects. Considering the targeted users, the laser emitter is worn on the head, enabling the user to intuitively control the robot motions with head movements that point the laser in the environment, whose projection is detected with a neural network based perception module. The interface implements two control modalities: the first allows the user to select specific locations directly, commanding the robot to reach those points; the second employs a paper keyboard with buttons that can be virtually pressed by pointing the laser at them. These buttons enable a more direct control of the Cartesian velocity of the end-effector and provide additional functionalities such as commanding the action of the gripper. The proposed interface is evaluated in a series of manipulation tasks involving a 6-DOF assistive robot manipulator equipped with a 1-DOF beak-like gripper. The two interface modalities are combined to successfully accomplish tasks requiring bimanual capacity that is usually affected in people with upper limbs impairments.
[13] arXiv:2503.15998 [pdf, html, other]
Title: Wearable Haptics for a Marionette-inspired Teleoperation of Highly Redundant Robotic Systems
Authors: Davide Torielli, Leonardo Franco, Maria Pozzi, Luca Muratore, Monica Malvezzi, Nikos Tsagarakis, Domenico Prattichizzo
Comments: 7 pages, 8 figures
Journal-ref: IEEE International Conference on Robotics and Automation (ICRA), Yokohama, Japan, 2024, pp. 15670-15676
Subjects: Robotics (cs.RO)

The teleoperation of complex, kinematically redundant robots with loco-manipulation capabilities represents a challenge for human operators, who have to learn how to operate the many degrees of freedom of the robot to accomplish a desired task. In this context, developing an easy-to-learn and easy-to-use human-robot interface is paramount. Recent works introduced a novel teleoperation concept, which relies on a virtual physical interaction interface between the human operator and the remote robot, equivalent to a "Marionette" control, but whose feedback was limited to only visual feedback on the human side. In this paper, we propose extending the "Marionette" interface by adding a wearable haptic interface to cope with the limitations of the previous works. Leveraging the additional haptic feedback modality, the human operator gains full sensorimotor control over the robot, and the awareness of the robot's response and interactions with the environment is greatly improved. We evaluated the proposed interface and the related teleoperation framework with naive users, assessing the teleoperation performance and the user experience with and without haptic feedback. The conducted experiments consisted of a loco-manipulation mission with the CENTAURO robot, a hybrid leg-wheel quadruped with a humanoid dual-arm upper body.
[14] arXiv:2503.16013 [pdf, html, other]
Title: GraspCoT: Integrating Physical Property Reasoning for 6-DoF Grasping under Flexible Language Instructions
Authors: Xiaomeng Chu, Jiajun Deng, Guoliang You, Wei Liu, Xingchen Li, Jianmin Ji, Yanyong Zhang
Subjects: Robotics (cs.RO); Computer Vision and Pattern Recognition (cs.CV)

Flexible instruction-guided 6-DoF grasping is a significant yet challenging task for real-world robotic systems. Existing methods utilize the contextual understanding capabilities of the large language models (LLMs) to establish mappings between expressions and targets, allowing robots to comprehend users' intentions in the instructions. However, the LLM's knowledge about objects' physical properties remains underexplored despite its tight relevance to grasping. In this work, we propose GraspCoT, a 6-DoF grasp detection framework that integrates a Chain-of-Thought (CoT) reasoning mechanism oriented to physical properties, guided by auxiliary question-answering (QA) tasks. Particularly, we design a set of QA templates to enable hierarchical reasoning that includes three stages: target parsing, physical property analysis, and grasp action selection. Moreover, GraspCoT presents a unified multimodal LLM architecture, which encodes multi-view observations of 3D scenes into 3D-aware visual tokens, and then jointly embeds these visual tokens with CoT-derived textual tokens within LLMs to generate grasp pose predictions. Furthermore, we present IntentGrasp, a large-scale benchmark that fills the gap in public datasets for multi-object grasp detection under diverse and indirect verbal commands. Extensive experiments on IntentGrasp demonstrate the superiority of our method, with additional validation in real-world robotic applications confirming its practicality. Codes and data will be released.
[15] arXiv:2503.16066 [pdf, html, other]
Title: Rejecting Outliers in 2D-3D Point Correspondences from 2D Forward-Looking Sonar Observations
Authors: Jiayi Su, Shaofeng Zou, Jingyu Qian, Yan Wei, Fengzhong Qu, Liuqing Yang
Subjects: Robotics (cs.RO)

Rejecting outliers before applying classical robust methods is a common approach to increase the success rate of estimation, particularly when the outlier ratio is extremely high (e.g. 90%). However, this method often relies on sensor- or task-specific characteristics, which may not be easily transferable across different scenarios. In this paper, we focus on the problem of rejecting 2D-3D point correspondence outliers from 2D forward-looking sonar (2D FLS) observations, which is one of the most popular perception devices in the underwater field but has a significantly different imaging mechanism compared to widely used perspective cameras and LiDAR. We fully leverage the narrow field of view in the elevation of 2D FLS and develop two compatibility tests for different 3D point configurations: (1) in general cases, we design a pairwise length in-range test to filter out overly long or short edges formed from point sets; (2) in coplanar cases, we design a coplanarity test to check if any four correspondences are compatible under a coplanar setting. Both tests are integrated into outlier rejection pipelines, where they are followed by maximum clique searching to identify the largest consistent measurement set as inliers. Extensive simulations demonstrate that the proposed methods for general and coplanar cases perform effectively under outlier ratios of 80% and 90%, respectively.
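The pairwise tests described above follow a common pattern in correspondence-based estimation: build a compatibility graph over putative matches and keep its maximum clique as the mutually consistent inlier set. The sketch below shows the generic rigid-transform version of a pairwise length test with maximum-clique selection; it illustrates the pattern only and does not reproduce the paper's sonar-specific in-range bounds.

import itertools
import numpy as np
import networkx as nx

def max_clique_inliers(src, dst, tol=0.05):
    """Pairwise-compatibility outlier rejection for putative 3D-3D correspondences.

    src, dst: (N, 3) arrays of matched points assumed related by a rigid transform,
    which preserves pairwise distances. Edge (i, j) exists if the two distances
    agree within tol; the maximum clique is returned as the mutually consistent set.
    """
    n = len(src)
    G = nx.Graph()
    G.add_nodes_from(range(n))
    for i, j in itertools.combinations(range(n), 2):
        if abs(np.linalg.norm(src[i] - src[j]) -
               np.linalg.norm(dst[i] - dst[j])) < tol:
            G.add_edge(i, j)
    # Exact maximum clique is NP-hard in general; enumeration is fine for small N.
    return max(nx.find_cliques(G), key=len)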
[16] arXiv:2503.16127 [pdf, html, other]
Title: The Morphology-Control Trade-Off: Insights into Soft Robotic Efficiency
Authors: Yue Xie, Kai-feng Chu, Xing Wang, Fumiya Iida
Comments: The paper is planned to be submitted to a journal
Subjects: Robotics (cs.RO); Neural and Evolutionary Computing (cs.NE)

Soft robotics holds transformative potential for enabling adaptive and adaptable systems in dynamic environments. However, the interplay between morphological and control complexities and their collective impact on task performance remains poorly understood. Therefore, in this study, we investigate these trade-offs across tasks of differing difficulty levels using four widely used morphological complexity metrics and control complexity measured by FLOPs. We investigate how these factors jointly influence task performance by utilizing evolutionary robot experiments. Results show that optimal performance depends on the alignment between morphology and control: simpler morphologies and lightweight controllers suffice for easier tasks, while harder tasks demand higher complexities in both dimensions. In addition, a clear trade-off between morphological and control complexities that achieve the same task performance can be observed. Moreover, we also propose a sensitivity analysis to expose the task-specific contributions of individual morphological metrics. Our study establishes a framework for investigating the relationships between morphology, control, and task performance, advancing the development of task-specific robotic designs that balance computational efficiency with adaptability. This study contributes to the practical application of soft robotics in real-world scenarios by providing actionable insights.
[17] arXiv:2503.16164 [pdf, html, other]
Title: Asymptotically Optimal Path Planning With an Approximation of the Omniscient Set
Authors: Jonáš Kříž, Vojtěch Vonásek
Comments: 9 pages, 13 figures
Journal-ref: IEEE Robotics and Automation Letters, vol. 10, no. 4, pp. 3214-3221, April 2025
Subjects: Robotics (cs.RO)

The asymptotically optimal version of Rapidly-exploring Random Tree (RRT*) is often used to find optimal paths in a high-dimensional configuration space. The well-known issue of RRT* is its slow convergence towards the optimal solution. A possible solution is to draw random samples only from a subset of the configuration space that is known to contain configurations that can improve the cost of the path (the omniscient set). A fast convergence rate may be achieved by approximating the omniscient set with a low-volume set. In this letter, we propose new methods to approximate the omniscient set and methods for their effective sampling. First, we propose to approximate the omniscient set using several (small) hyperellipsoids defined by sections of the current best solution. The second approach approximates the omniscient set by a convex hull computed from the current solution. Both approaches ensure asymptotic optimality and work in a general n-dimensional configuration space. The experiments have shown superior performance of our approaches in multiple scenarios in 3D and 6D configuration spaces.
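The omniscient set discussed above generalizes the informed set of Informed RRT*: for Euclidean path cost, once a solution of cost $c_{best}$ between $x_{start}$ and $x_{goal}$ exists, only points with $\|x - x_{start}\| + \|x - x_{goal}\| \le c_{best}$ can improve it, and that set is a prolate hyperellipsoid that can be sampled directly. The sketch below shows this standard single-ellipsoid sampling as background for the multi-hyperellipsoid and convex-hull approximations proposed in the letter; it is not the authors' implementation.

import numpy as np

def sample_informed(x_start, x_goal, c_best, rng=np.random.default_rng()):
    """Uniformly sample the prolate hyperellipsoid of states that can shorten the path."""
    n = len(x_start)
    c_min = np.linalg.norm(x_goal - x_start)        # theoretical minimum path cost
    centre = (x_start + x_goal) / 2.0

    # Rotation aligning the first axis with the start-goal direction.
    a1 = (x_goal - x_start) / c_min
    U, _, Vt = np.linalg.svd(np.outer(a1, np.eye(n)[0]))
    C = U @ np.diag([1.0] * (n - 1) + [np.linalg.det(U) * np.linalg.det(Vt)]) @ Vt

    # Axis lengths: transverse c_best, conjugate sqrt(c_best^2 - c_min^2).
    r = np.full(n, np.sqrt(max(c_best**2 - c_min**2, 0.0)) / 2.0)
    r[0] = c_best / 2.0

    # Uniform sample in the unit n-ball, then stretch, rotate, and translate.
    x_ball = rng.normal(size=n)
    x_ball *= rng.random() ** (1.0 / n) / np.linalg.norm(x_ball)
    return C @ np.diag(r) @ x_ball + centre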
[18] arXiv:2503.16197 [pdf, html, other]
Title: Explosive Jumping with Rigid and Articulated Soft Quadrupeds via Example Guided Reinforcement Learning
Authors: Georgios Apostolides, Wei Pan, Jens Kober, Cosimo Della Santina, Jiatao Ding
Comments: 8 pages, 9 figures, submitted to IROS 2025
Subjects: Robotics (cs.RO)

Achieving controlled jumping behaviour for a quadruped robot is a challenging task, especially when introducing passive compliance in mechanical design. This study addresses this challenge via imitation-based deep reinforcement learning with a progressive training process. To start, we learn the jumping skill by mimicking a coarse jumping example generated by model-based trajectory optimization. Subsequently, we generalize the learned policy to broader situations, including various distances in both forward and lateral directions, and then pursue robust jumping in unknown ground unevenness. In addition, without tuning the reward much, we learn the jumping policy for a quadruped with parallel elasticity. Results show that using the proposed method, i) the robot learns versatile jumps by learning only from a single demonstration, ii) the robot with parallel compliance reduces the landing error by 11.1%, saves energy cost by 15.2% and reduces the peak torque by 15.8%, compared to the rigid robot without parallel elasticity, iii) the robot can perform jumps of variable distances with robustness against ground unevenness (maximal 4 cm height perturbations) using only proprioceptive perception.
</p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2503.16275" title="Abstract" id="2503.16275"> arXiv:2503.16275 </a> [<a href="/pdf/2503.16275" title="Download PDF" id="pdf-2503.16275" aria-labelledby="pdf-2503.16275">pdf</a>, <a href="https://arxiv.org/html/2503.16275v1" title="View HTML" id="html-2503.16275" aria-labelledby="html-2503.16275" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16275" title="Other formats" id="oth-2503.16275" aria-labelledby="oth-2503.16275">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Loop Closure from Two Views: Revisiting PGO for Scalable Trajectory Estimation through Monocular Priors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+T+Y">Tian Yi Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+B">Boyang Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pollefeys,+M">Marc Pollefeys</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Blum,+H">Hermann Blum</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> (Visual) Simultaneous Localization and Mapping (SLAM) remains a fundamental challenge in enabling autonomous systems to navigate and understand large-scale environments. Traditional SLAM approaches struggle to balance efficiency and accuracy, particularly in large-scale settings where extensive computational resources are required for scene reconstruction and Bundle Adjustment (BA). However, this scene reconstruction, in the form of sparse pointclouds of visual landmarks, is often only used within the SLAM system because navigation and planning methods require different map representations. In this work, we therefore investigate a more scalable Visual SLAM (VSLAM) approach without reconstruction, mainly based on approaches for two-view loop closures. By restricting the map to a sparse keyframed pose graph without dense geometry representations, our '2GO' system achieves efficient optimization with competitive absolute trajectory accuracy. In particular, we find that recent advancements in image matching and monocular depth priors enable very accurate trajectory optimization from two-view edges. We conduct extensive experiments on diverse datasets, including large-scale scenarios, and provide a detailed analysis of the trade-offs between runtime, accuracy, and map size. Our results demonstrate that this streamlined approach supports real-time performance, scales well in map size and trajectory duration, and effectively broadens the capabilities of VSLAM for long-duration deployments to large environments. </p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.16310" title="Abstract" id="2503.16310"> arXiv:2503.16310 </a> [<a href="/pdf/2503.16310" title="Download PDF" id="pdf-2503.16310" aria-labelledby="pdf-2503.16310">pdf</a>, <a href="https://arxiv.org/html/2503.16310v1" title="View HTML" id="html-2503.16310" aria-labelledby="html-2503.16310" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16310" title="Other formats" id="oth-2503.16310" aria-labelledby="oth-2503.16310">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Real-to-Sim Approaches Capture Dynamic Fabric Behavior for Robotic Fabric Manipulation? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ru,+Y">Yingdong Ru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuang,+L">Lipeng Zhuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zhuo He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Audonnet,+F+P">Florent P. Audonnet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aragon-Caramasa,+G">Gerardo Aragon-Caramasa</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> This paper presents a rigorous evaluation of Real-to-Sim parameter estimation approaches for fabric manipulation in robotics. The study systematically assesses three state-of-the-art approaches, namely two differential pipelines and a data-driven approach. We also devise a novel physics-informed neural network approach for physics parameter estimation. These approaches are interfaced with two simulations across multiple Real-to-Sim scenarios (lifting, wind blowing, and stretching) for five different fabric types and evaluated on three unseen scenarios (folding, fling, and shaking). We found that the simulation engines and the choice of Real-to-Sim approaches significantly impact fabric manipulation performance in our evaluation scenarios. Moreover, PINN observes superior performance in quasi-static tasks but shows limitations in dynamic scenarios. </p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2503.16408" title="Abstract" id="2503.16408"> arXiv:2503.16408 </a> [<a href="/pdf/2503.16408" title="Download PDF" id="pdf-2503.16408" aria-labelledby="pdf-2503.16408">pdf</a>, <a href="/format/2503.16408" title="Other formats" id="oth-2503.16408" aria-labelledby="oth-2503.16408">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RoboFactory: Exploring Embodied Agent Collaboration with Compositional Constraints </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Y">Yiran Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+L">Li Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xiufeng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Z">Zhenfei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaohong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xihui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruimao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+L">Lei Bai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project page: <a href="https://iranqin.github.io/robofactory/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Designing effective embodied multi-agent systems is critical for solving complex real-world tasks across domains. Due to the complexity of multi-agent embodied systems, existing methods fail to automatically generate safe and efficient training data for such systems. 
To this end, we propose the concept of compositional constraints for embodied multi-agent systems, addressing the challenges arising from collaboration among embodied agents. We design various interfaces tailored to different types of constraints, enabling seamless interaction with the physical world. Leveraging compositional constraints and specifically designed interfaces, we develop an automated data collection framework for embodied multi-agent systems and introduce the first benchmark for embodied multi-agent manipulation, RoboFactory. Based on the RoboFactory benchmark, we adapt and evaluate imitation learning methods and analyze their performance on agent tasks of varying difficulty. Furthermore, we explore the architectures and training strategies for multi-agent imitation learning, aiming to build safe and efficient embodied multi-agent systems. </p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 16 of 16 entries)</h3> <dt> <a name='item22'>[22]</a> <a href ="/abs/2503.15491" title="Abstract" id="2503.15491"> arXiv:2503.15491 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15491" title="Download PDF" id="pdf-2503.15491" aria-labelledby="pdf-2503.15491">pdf</a>, <a href="https://arxiv.org/html/2503.15491v1" title="View HTML" id="html-2503.15491" aria-labelledby="html-2503.15491" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15491" title="Other formats" id="oth-2503.15491" aria-labelledby="oth-2503.15491">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Agreeing to Interact in Human-Robot Interaction using Large Language Models and Vision Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sasabuchi,+K">Kazuhiro Sasabuchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wake,+N">Naoki Wake</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kanehira,+A">Atsushi Kanehira</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Takamatsu,+J">Jun Takamatsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ikeuchi,+K">Katsushi Ikeuchi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> In human-robot interaction (HRI), the beginning of an interaction is often complex. Whether the robot should communicate with the human depends on several situational factors (e.g., the current human's activity, urgency of the interaction, etc.). We test whether large language models (LLMs) and vision language models (VLMs) can provide solutions to this problem. We compare four different system-design patterns using LLMs and VLMs, and evaluate them on a test set containing 84 human-robot situations. The test set mixes several publicly available datasets and also includes situations where the appropriate action to take is open-ended. Our results using the GPT-4o and Phi-3 Vision models indicate that LLMs and VLMs are capable of handling interaction beginnings when the desired actions are clear; however, challenges remain in open-ended situations where the model must balance the human's and the robot's situations.
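</p> <p class='mathjax'> The general pattern of delegating the "should I interact?" decision to a language model can be sketched as follows (a hedged illustration that assumes the openai Python client and an API key in the environment; the action labels and prompt are hypothetical, not the authors' design): </p> <pre>
from openai import OpenAI  # assumes an OPENAI_API_KEY in the environment

ACTIONS = ["initiate_interaction", "wait", "do_not_interact"]  # hypothetical label set

def decide_interaction(situation: str, model: str = "gpt-4o") -> str:
    """Ask a chat model whether the robot should start interacting, given a
    textual description of the human's activity and the request's urgency."""
    client = OpenAI()
    prompt = (
        "You are a service robot deciding whether to start an interaction.\n"
        f"Situation: {situation}\n"
        f"Answer with exactly one of: {', '.join(ACTIONS)}."
    )
    reply = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    answer = reply.choices[0].message.content.strip()
    return answer if answer in ACTIONS else "wait"  # conservative fallback

print(decide_interaction("The person is on a phone call; the delivery is not urgent."))
</pre> <p class='mathjax'>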
</p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2503.15496" title="Abstract" id="2503.15496"> arXiv:2503.15496 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15496" title="Download PDF" id="pdf-2503.15496" aria-labelledby="pdf-2503.15496">pdf</a>, <a href="https://arxiv.org/html/2503.15496v1" title="View HTML" id="html-2503.15496" aria-labelledby="html-2503.15496" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15496" title="Other formats" id="oth-2503.15496" aria-labelledby="oth-2503.15496">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fast Multi-Party Open-Ended Conversation with a Social Robot </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Abbo,+G+A">Giulio Antonio Abbo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pinto-Bernal,+M+J">Maria Jose Pinto-Bernal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Catrycke,+M">Martijn Catrycke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belpaeme,+T">Tony Belpaeme</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 6 figures, 1 appendix </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Robotics (cs.RO) </div> <p class='mathjax'> This paper presents the implementation and evaluation of a conversational agent designed for multi-party open-ended interactions. Leveraging state-of-the-art technologies such as voice direction of arrival, voice recognition, face tracking, and large language models, the system aims to facilitate natural and intuitive human-robot conversations. Deployed on the Furhat robot, the system was tested with 30 participants engaging in open-ended group conversations and then in two overlapping discussions. Quantitative metrics, such as latencies and recognition accuracy, along with qualitative measures from user questionnaires, were collected to assess performance. The results highlight the system's effectiveness in managing multi-party interactions, though improvements are needed in response relevance and latency. This study contributes valuable insights for advancing human-robot interaction, particularly in enhancing the naturalness and engagement in group conversations. 
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2503.15500" title="Abstract" id="2503.15500"> arXiv:2503.15500 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15500" title="Download PDF" id="pdf-2503.15500" aria-labelledby="pdf-2503.15500">pdf</a>, <a href="https://arxiv.org/html/2503.15500v1" title="View HTML" id="html-2503.15500" aria-labelledby="html-2503.15500" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15500" title="Other formats" id="oth-2503.15500" aria-labelledby="oth-2503.15500">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ImageInThat: Manipulating Images to Convey User Instructions to Robots </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mahadevan,+K">Karthik Mahadevan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lewis,+B">Blaine Lewis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiannan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mutlu,+B">Bilge Mutlu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+A">Anthony Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Grossman,+T">Tovi Grossman</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In Proceedings of the ACM/IEEE International Conference on Human-Robot Interaction (HRI), 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Foundation models are rapidly improving the capability of robots in performing everyday tasks autonomously such as meal preparation, yet robots will still need to be instructed by humans due to model performance, the difficulty of capturing user preferences, and the need for user agency. Robots can be instructed using various methods-natural language conveys immediate instructions but can be abstract or ambiguous, whereas end-user programming supports longer horizon tasks but interfaces face difficulties in capturing user intent. In this work, we propose using direct manipulation of images as an alternative paradigm to instruct robots, and introduce a specific instantiation called ImageInThat which allows users to perform direct manipulation on images in a timeline-style interface to generate robot instructions. Through a user study, we demonstrate the efficacy of ImageInThat to instruct robots in kitchen manipulation tasks, comparing it to a text-based natural language instruction method. The results show that participants were faster with ImageInThat and preferred to use it over the text-based method. Supplementary material including code can be found at: <a href="https://image-in-that.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2503.15503" title="Abstract" id="2503.15503"> arXiv:2503.15503 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15503" title="Download PDF" id="pdf-2503.15503" aria-labelledby="pdf-2503.15503">pdf</a>, <a href="/format/2503.15503" title="Other formats" id="oth-2503.15503" aria-labelledby="oth-2503.15503">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Impact of Extended Reality on Robot-Assisted Surgery Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bickford,+M">Michael Bickford</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alruwaili,+F">Fayez Alruwaili</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ragab,+S">Sara Ragab</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rothenberg,+H">Hanna Rothenberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abedin-Nasab,+M">Mohammad Abedin-Nasab</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The article is under review for publication </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Robot Assisted Surgeries (RAS) have one of the steepest learning curves of any type of surgery. Because of this, methods to practice RAS outside the operating room have been developed to improve surgeons' skills. These strategies include the incorporation of extended reality simulators into surgical training programs. In this systematic review, we seek to determine whether extended reality simulators can improve the performance of novice surgeons and how their performance compares to that of surgeons conventionally trained on surgical robots. Using the PRISMA 2020 guidelines, a systematic review and meta-analysis was performed, searching PubMed, Embase, Web of Science, and the Cochrane Library for studies that compared the performance of novice surgeons who received no additional training, trained with extended reality, or trained with inanimate physical simulators (conventional additional training). We included articles that gauged performance using either GEARS or time-to-completion measurements and used SPSS to perform a meta-analysis comparing the performance outcomes of the surgeons after training. Surgeons trained using extended reality completed their surgical tasks statistically significantly faster than those who did not receive training (Cohen's d=-0.95, p=0.02), and moderately slower than those conventionally trained (Cohen's d=0.65, p=0.14), although this difference was not statistically significant. Surgeons trained on extended reality demonstrated a statistically significant improvement in GEARS scores over those who did not train (Cohen's d=0.964, p<0.001), while their GEARS scores were comparable to those of surgeons trained conventionally (Cohen's d=0.65, p=0.14). This meta-analysis demonstrates that extended reality simulators impart complex skills to surgeons in a low-cost and low-risk environment.
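</p> <p class='mathjax'> The comparisons above are reported as Cohen's d effect sizes; as a reminder of what that statistic measures, here is a minimal computation using a pooled standard deviation on illustrative (not the study's) task-completion times: </p> <pre>
import numpy as np

def cohens_d(group_a, group_b):
    """Cohen's d with the pooled standard deviation of two independent groups."""
    a, b = np.asarray(group_a, float), np.asarray(group_b, float)
    na, nb = len(a), len(b)
    pooled_var = ((na - 1) * a.var(ddof=1) + (nb - 1) * b.var(ddof=1)) / (na + nb - 2)
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)

# illustrative completion times in seconds; a negative d favours the XR-trained group
xr_trained = [312, 298, 305, 290, 320]
untrained  = [355, 340, 362, 348, 371]
print(cohens_d(xr_trained, untrained))
</pre> <p class='mathjax'>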
</p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2503.15504" title="Abstract" id="2503.15504"> arXiv:2503.15504 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15504" title="Download PDF" id="pdf-2503.15504" aria-labelledby="pdf-2503.15504">pdf</a>, <a href="/format/2503.15504" title="Other formats" id="oth-2503.15504" aria-labelledby="oth-2503.15504">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GRETA: Modular Platform to Create Adaptive Socially Interactive Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Grimaldi,+M">Michele Grimaldi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Woo,+J">Jieyeon Woo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boucaud,+F">Fabien Boucaud</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galland,+L">Lucie Galland</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Younsi,+N">Nezih Younsi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fares,+M">Mireille Fares</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Graux,+S">Sean Graux</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gauthier,+P">Philippe Gauthier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pelachaud,+C">Catherine Pelachaud</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Interaction between humans is complex to describe, as it combines elements from different modalities, such as speech, gaze, and gestures, that are influenced by social attitudes and emotions. Furthermore, the interaction can be affected by features of the interlocutor's state. Current Socially Interactive Agents (SIAs) aim to adapt themselves to the state of the interaction partner. In this paper, we discuss this adaptation by describing the architecture of the GRETA platform, which considers external features while interacting with humans and/or another ECA and processes the dialogue incrementally. We illustrate the new architecture of GRETA, which handles the external features, the adaptation, and the incremental approach to dialogue processing. </p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2503.15510" title="Abstract" id="2503.15510"> arXiv:2503.15510 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15510" title="Download PDF" id="pdf-2503.15510" aria-labelledby="pdf-2503.15510">pdf</a>, <a href="https://arxiv.org/html/2503.15510v1" title="View HTML" id="html-2503.15510" aria-labelledby="html-2503.15510" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15510" title="Other formats" id="oth-2503.15510" aria-labelledby="oth-2503.15510">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Joint Decision-Making in Robot Teleoperation: When are Two Heads Better Than One?
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+D">Duc-An Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhattacharyya,+R">Raunak Bhattacharyya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Colombatto,+C">Clara Colombatto</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fleming,+S">Steve Fleming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Posner,+I">Ingmar Posner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hawes,+N">Nick Hawes</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To be published in the 2025 20th ACM/IEEE International Conference on Human-Robot Interaction (HRI) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Operators working with robots in safety-critical domains have to make decisions under uncertainty, which remains a challenging problem for a single human operator. An open question is whether two human operators can make better decisions jointly, as compared to a single operator alone. While prior work has shown that two heads are better than one, such studies have been mostly limited to static and passive tasks. We investigate joint decision-making in a dynamic task involving humans teleoperating robots. We conduct a human-subject experiment with $N=100$ participants where each participant performed a navigation task with two mobile robots in simulation. We find that joint decision-making through confidence sharing improves dyad performance beyond the better-performing individual (p<0.0001). Further, we find that the extent of this benefit is regulated both by the skill level of each individual and by how well-calibrated their confidence estimates are. Finally, we present findings on characterising the human-human dyad's confidence calibration based on the individuals constituting the dyad. Our findings demonstrate for the first time that two heads are better than one, even on a spatiotemporal task which includes active operator control of robots.
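</p> <p class='mathjax'> "Joint decision-making through confidence sharing" can be illustrated with a generic maximum-confidence rule from the two-heads literature, in which the dyad adopts the answer of whichever member reports higher confidence (a toy simulation, not the authors' experimental protocol or data): </p> <pre>
import numpy as np

rng = np.random.default_rng(0)

def simulate_dyad(n_trials=100_000, sigma_a=1.0, sigma_b=1.0):
    """Two operators judge the sign of a latent variable from noisy evidence and
    report the magnitude of their evidence as confidence; the dyad adopts the
    answer of the more confident operator (maximum-confidence slating)."""
    truth = rng.choice([-1.0, 1.0], size=n_trials)
    ev_a = truth + rng.normal(0.0, sigma_a, n_trials)
    ev_b = truth + rng.normal(0.0, sigma_b, n_trials)
    dyad = np.where(np.abs(ev_a) >= np.abs(ev_b), np.sign(ev_a), np.sign(ev_b))
    acc = lambda decisions: float(np.mean(decisions == truth))
    return acc(np.sign(ev_a)), acc(np.sign(ev_b)), acc(dyad)

# With similarly skilled operators the dyad beats either individual;
# widening the skill gap (e.g. sigma_b=2.0) shrinks or can reverse that benefit.
print(simulate_dyad())
</pre> <p class='mathjax'>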
</p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2503.15557" title="Abstract" id="2503.15557"> arXiv:2503.15557 </a> (cross-list from cs.GR) [<a href="/pdf/2503.15557" title="Download PDF" id="pdf-2503.15557" aria-labelledby="pdf-2503.15557">pdf</a>, <a href="https://arxiv.org/html/2503.15557v1" title="View HTML" id="html-2503.15557" aria-labelledby="html-2503.15557" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15557" title="Other formats" id="oth-2503.15557" aria-labelledby="oth-2503.15557">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Motion Synthesis with Sparse and Flexible Keyjoint Control </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+I">Inwoo Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bae,+J">Jinseok Bae</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+D">Donggeun Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y+M">Young Min Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, Project Page: <a href="http://inwoohwang.me/SFControl" rel="external noopener nofollow" class="link-external link-http">this http URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV); Robotics (cs.RO) </div> <p class='mathjax'> Creating expressive character animations is labor-intensive, requiring intricate manual adjustment by animators across space and time. Previous works on controllable motion generation often rely on a predefined set of dense spatio-temporal specifications (e.g., dense pelvis trajectories with exact per-frame timing), limiting practicality for animators. To support high-level intent and intuitive control in diverse scenarios, we propose a practical controllable motion synthesis framework that respects sparse and flexible keyjoint signals. Our approach employs a decomposed diffusion-based motion synthesis framework that first synthesizes keyjoint movements from sparse input control signals and then synthesizes full-body motion based on the completed keyjoint trajectories. The low-dimensional keyjoint movements can easily adapt to various control signal types, such as end-effector position for diverse goal-driven motion synthesis, or incorporate functional constraints on a subset of keyjoints. Additionally, we introduce a time-agnostic control formulation, eliminating the need for frame-specific timing annotations and enhancing control flexibility. The shared second stage can then synthesize a natural whole-body motion that precisely satisfies the task requirement from dense keyjoint movements. We demonstrate the effectiveness of sparse and flexible keyjoint control through comprehensive experiments on diverse datasets and scenarios.
</p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2503.15558" title="Abstract" id="2503.15558"> arXiv:2503.15558 </a> (cross-list from cs.AI) [<a href="/pdf/2503.15558" title="Download PDF" id="pdf-2503.15558" aria-labelledby="pdf-2503.15558">pdf</a>, <a href="https://arxiv.org/html/2503.15558v1" title="View HTML" id="html-2503.15558" aria-labelledby="html-2503.15558" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15558" title="Other formats" id="oth-2503.15558" aria-labelledby="oth-2503.15558">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=NVIDIA">NVIDIA</a>: <a href="https://arxiv.org/search/cs?searchtype=author&query=Azzolini,+A">Alisson Azzolini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brandon,+H">Hannah Brandon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chattopadhyay,+P">Prithvijit Chattopadhyay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Huayu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+J">Jinju Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+Y">Yin Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diamond,+J">Jenna Diamond</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+Y">Yifan Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ferroni,+F">Francesco Ferroni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Govindaraju,+R">Rama Govindaraju</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jinwei Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gururani,+S">Siddharth Gururani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hanafi,+I+E">Imad El Hanafi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zekun Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huffman,+J">Jacob Huffman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+J">Jingyi Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Johnson,+B">Brendan Johnson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khan,+R">Rizwan Khan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kurian,+G">George Kurian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lantz,+E">Elena Lantz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+N">Nayeon Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhaoshuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+T">Tsung-Yi Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yen-Chen Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Ming-Yu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mathau,+A">Andrew Mathau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+Y">Yun Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pavao,+L">Lindsey Pavao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ping,+W">Wei Ping</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Romero,+D+W">David W. Romero</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Smelyanskiy,+M">Misha Smelyanskiy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+S">Shuran Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tchapmi,+L">Lyne Tchapmi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+A+Z">Andrew Z. Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Boxin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoxiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+F">Fangyin Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jiashu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xiaodong Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zhuolin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+X">Xiaohui Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhe Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> Physical AI systems need to perceive, understand, and perform complex actions in the physical world. In this paper, we present the Cosmos-Reason1 models that can understand the physical world and generate appropriate embodied decisions (e.g., next step action) in natural language through long chain-of-thought reasoning processes. We begin by defining key capabilities for Physical AI reasoning, with a focus on physical common sense and embodied reasoning. To represent physical common sense, we use a hierarchical ontology that captures fundamental knowledge about space, time, and physics. For embodied reasoning, we rely on a two-dimensional ontology that generalizes across different physical embodiments. Building on these capabilities, we develop two multimodal large language models, Cosmos-Reason1-8B and Cosmos-Reason1-56B. We curate data and train our models in four stages: vision pre-training, general supervised fine-tuning (SFT), Physical AI SFT, and Physical AI reinforcement learning (RL) as the post-training. To evaluate our models, we build comprehensive benchmarks for physical common sense and embodied reasoning according to our ontologies. Evaluation results show that Physical AI SFT and reinforcement learning bring significant improvements. To facilitate the development of Physical AI, we will make our code and pre-trained models available under the NVIDIA Open Model License at <a href="https://github.com/nvidia-cosmos/cosmos-reason1" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2503.15615" title="Abstract" id="2503.15615"> arXiv:2503.15615 </a> (cross-list from cs.LG) [<a href="/pdf/2503.15615" title="Download PDF" id="pdf-2503.15615" aria-labelledby="pdf-2503.15615">pdf</a>, <a href="https://arxiv.org/html/2503.15615v1" title="View HTML" id="html-2503.15615" aria-labelledby="html-2503.15615" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15615" title="Other formats" id="oth-2503.15615" aria-labelledby="oth-2503.15615">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PEnGUiN: Partially Equivariant Graph NeUral Networks for Sample Efficient MARL </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=McClellan,+J">Joshua McClellan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brothers,+G">Greyson Brothers</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Furong Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tokekar,+P">Pratap Tokekar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Robotics (cs.RO) </div> <p class='mathjax'> Equivariant Graph Neural Networks (EGNNs) have emerged as a promising approach in Multi-Agent Reinforcement Learning (MARL), leveraging symmetry guarantees to greatly improve sample efficiency and generalization. However, real-world environments often exhibit inherent asymmetries arising from factors such as external forces, measurement inaccuracies, or intrinsic system biases. This paper introduces \textit{Partially Equivariant Graph NeUral Networks (PEnGUiN)}, a novel architecture specifically designed to address these challenges. We formally identify and categorize various types of partial equivariance relevant to MARL, including subgroup equivariance, feature-wise equivariance, regional equivariance, and approximate equivariance. We theoretically demonstrate that PEnGUiN is capable of learning both fully equivariant (EGNN) and non-equivariant (GNN) representations within a unified framework. Through extensive experiments on a range of MARL problems incorporating various asymmetries, we empirically validate the efficacy of PEnGUiN. Our results consistently demonstrate that PEnGUiN outperforms both EGNNs and standard GNNs in asymmetric environments, highlighting their potential to improve the robustness and applicability of graph-based MARL algorithms in real-world scenarios. 
</p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2503.15672" title="Abstract" id="2503.15672"> arXiv:2503.15672 </a> (cross-list from cs.CV) [<a href="/pdf/2503.15672" title="Download PDF" id="pdf-2503.15672" aria-labelledby="pdf-2503.15672">pdf</a>, <a href="https://arxiv.org/html/2503.15672v1" title="View HTML" id="html-2503.15672" aria-labelledby="html-2503.15672" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15672" title="Other formats" id="oth-2503.15672" aria-labelledby="oth-2503.15672">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GASP: Unifying Geometric and Semantic Self-Supervised Pre-training for Autonomous Driving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ljungbergh,+W">William Ljungbergh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lilja,+A">Adam Lilja</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+A+T+A+L">Adam Tonderski, Arvid Laveno Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lindstr%C3%B6m,+C">Carl Lindström</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Verbeke,+W">Willem Verbeke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+J">Junsheng Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Petersson,+C">Christoffer Petersson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hammarstrand,+L">Lars Hammarstrand</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Felsberg,+M">Michael Felsberg</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Self-supervised pre-training based on next-token prediction has enabled large language models to capture the underlying structure of text, and has led to unprecedented performance on a large array of tasks when applied at scale. Similarly, autonomous driving generates vast amounts of spatiotemporal data, alluding to the possibility of harnessing scale to learn the underlying geometric and semantic structure of the environment and its evolution over time. In this direction, we propose a geometric and semantic self-supervised pre-training method, GASP, that learns a unified representation by predicting, at any queried future point in spacetime, (1) general occupancy, capturing the evolving structure of the 3D scene; (2) ego occupancy, modeling the ego vehicle path through the environment; and (3) distilled high-level features from a vision foundation model. By modeling geometric and semantic 4D occupancy fields instead of raw sensor measurements, the model learns a structured, generalizable representation of the environment and its evolution through time. We validate GASP on multiple autonomous driving benchmarks, demonstrating significant improvements in semantic occupancy forecasting, online mapping, and ego trajectory prediction. Our results demonstrate that continuous 4D geometric and semantic occupancy prediction provides a scalable and effective pre-training paradigm for autonomous driving. For code and additional visualizations, see <a href="https://research.zenseact.com/publications/gasp/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>.
</p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2503.15778" title="Abstract" id="2503.15778"> arXiv:2503.15778 </a> (cross-list from cs.CV) [<a href="/pdf/2503.15778" title="Download PDF" id="pdf-2503.15778" aria-labelledby="pdf-2503.15778">pdf</a>, <a href="https://arxiv.org/html/2503.15778v1" title="View HTML" id="html-2503.15778" aria-labelledby="html-2503.15778" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15778" title="Other formats" id="oth-2503.15778" aria-labelledby="oth-2503.15778">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AutoDrive-QA- Automated Generation of Multiple-Choice Questions for Autonomous Driving Datasets Using Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khalili,+B">Boshra Khalili</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=W.Smyth,+A">Andrew W.Smyth</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Robotics (cs.RO) </div> <p class='mathjax'> In autonomous driving, open-ended question answering often suffers from unreliable evaluations because freeform responses require either complex metrics or subjective human judgment. To address this challenge, we introduce AutoDrive-QA, an automatic pipeline that converts existing driving QA datasets (including DriveLM, NuScenes-QA, and LingoQA) into a structured multiple-choice question (MCQ) format. This benchmark systematically assesses perception, prediction, and planning tasks, providing a standardized and objective evaluation framework. AutoDrive-QA employs an automated pipeline that leverages large language models (LLMs) to generate high-quality, contextually relevant distractors based on domain-specific error patterns commonly found in autonomous driving scenarios. To evaluate both general capabilities and generalization performance, we test the benchmark on three public datasets and conduct zero-shot experiments on an unseen dataset. The zero-shot evaluations reveal that GPT-4V leads with 69.57% accuracy -- achieving 74.94% in Perception, 65.33% in Prediction, and 68.45% in Planning -- demonstrating that while all models excel in Perception, they struggle in Prediction. Consequently, AutoDrive-QA establishes a rigorous, unbiased standard for integrating and evaluating different vision-language models across various autonomous driving datasets, thereby improving generalization in this field. We release all the codes in the AutoDrive-QA GitHub Repository. </p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2503.16216" title="Abstract" id="2503.16216"> arXiv:2503.16216 </a> (cross-list from cs.DC) [<a href="/pdf/2503.16216" title="Download PDF" id="pdf-2503.16216" aria-labelledby="pdf-2503.16216">pdf</a>, <a href="https://arxiv.org/html/2503.16216v1" title="View HTML" id="html-2503.16216" aria-labelledby="html-2503.16216" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16216" title="Other formats" id="oth-2503.16216" aria-labelledby="oth-2503.16216">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dispersion is (Almost) Optimal under (A)synchrony </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kshemkalyani,+A+D">Ajay D. 
Kshemkalyani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+M">Manish Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Molla,+A+R">Anisur Rahaman Molla</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sharma,+G">Gokarna Sharma</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Distributed, Parallel, and Cluster Computing (cs.DC)</span>; Data Structures and Algorithms (cs.DS); Multiagent Systems (cs.MA); Robotics (cs.RO) </div> <p class='mathjax'> The dispersion problem has received much attention recently in the distributed computing literature. In this problem, $k\leq n$ agents placed initially arbitrarily on the nodes of an $n$-node, $m$-edge anonymous graph of maximum degree $\Delta$ have to reposition autonomously to reach a configuration in which each agent is on a distinct node of the graph. Dispersion is interesting as well as important due to its connections to many fundamental coordination problems by mobile agents on graphs, such as exploration, scattering, load balancing, relocation of self-driven electric cars (robots) to recharge stations (nodes), etc. The objective has been to provide a solution that optimizes simultaneously time and memory complexities. There exist graphs for which the lower bound on time complexity is $\Omega(k)$. Memory complexity is $\Omega(\log k)$ per agent independent of graph topology. The state-of-the-art algorithms have (i) time complexity $O(k\log^2k)$ and memory complexity $O(\log(k+\Delta))$ under the synchronous setting [DISC'24] and (ii) time complexity $O(\min\{m,k\Delta\})$ and memory complexity $O(\log(k+\Delta))$ under the asynchronous setting [OPODIS'21]. In this paper, we improve substantially on this state-of-the-art. Under the synchronous setting as in [DISC'24], we present the first optimal $O(k)$ time algorithm keeping memory complexity $O(\log (k+\Delta))$. Under the asynchronous setting as in [OPODIS'21], we present the first algorithm with time complexity $O(k\log k)$ keeping memory complexity $O(\log (k+\Delta))$, which is time-optimal within an $O(\log k)$ factor despite asynchrony. Both results were obtained through novel techniques to quickly find empty nodes to settle agents, which may be of independent interest. 
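</p> <p class='mathjax'> As a toy illustration of the dispersion objective itself (a centralized sketch of the problem statement, not the distributed algorithms or the complexity bounds discussed above), the following routine settles each surplus agent on the nearest free node found by breadth-first search: </p> <pre>
from collections import deque

def disperse(adj, starts):
    """Toy, centralized dispersion: for each agent beyond the first on a node,
    walk it to the nearest unoccupied node found by BFS. `adj` maps a node to
    its neighbours; `starts` lists the agents' initial nodes (at most one agent
    per node can settle, and there are no more agents than nodes)."""
    occupied, placement = set(), {}
    for agent, node in enumerate(starts):
        if node not in occupied:            # first agent on this node settles here
            occupied.add(node)
            placement[agent] = node
            continue
        queue, seen = deque([node]), {node}  # BFS for the closest free node
        while queue:
            u = queue.popleft()
            if u not in occupied:
                occupied.add(u)
                placement[agent] = u
                break
            for v in adj[u]:
                if v not in seen:
                    seen.add(v)
                    queue.append(v)
    return placement

ring = {i: [(i - 1) % 6, (i + 1) % 6] for i in range(6)}   # 6-node cycle
print(disperse(ring, starts=[0, 0, 0, 3]))                  # {0: 0, 1: 5, 2: 1, 3: 3}
</pre> <p class='mathjax'>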
</p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2503.16263" title="Abstract" id="2503.16263"> arXiv:2503.16263 </a> (cross-list from cs.CV) [<a href="/pdf/2503.16263" title="Download PDF" id="pdf-2503.16263" aria-labelledby="pdf-2503.16263">pdf</a>, <a href="https://arxiv.org/html/2503.16263v1" title="View HTML" id="html-2503.16263" aria-labelledby="html-2503.16263" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16263" title="Other formats" id="oth-2503.16263" aria-labelledby="oth-2503.16263">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Monocular Vision to Autonomous Action: Guiding Tumor Resection via 3D Reconstruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Acar,+A">Ayberk Acar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Smith,+M">Mariana Smith</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Al-Zogbi,+L">Lidia Al-Zogbi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Watts,+T">Tanner Watts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+F">Fangjie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yilmaz,+N">Nural Yilmaz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Scheikl,+P+M">Paul Maria Scheikl</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=d'Almeida,+J+F">Jesse F. d'Almeida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sharma,+S">Susheela Sharma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Branscombe,+L">Lauren Branscombe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ertop,+T+E">Tayfun Efe Ertop</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Webster,+R+J">Robert J. Webster III</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oguz,+I">Ipek Oguz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuntz,+A">Alan Kuntz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krieger,+A">Axel Krieger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J+Y">Jie Ying Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 Pages, 8 Figures, 1 Table. This work has been submitted IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) for possible publication </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Surgical automation requires precise guidance and understanding of the scene. Current methods in the literature rely on bulky depth cameras to create maps of the anatomy, however this does not translate well to space-limited clinical applications. Monocular cameras are small and allow minimally invasive surgeries in tight spaces but additional processing is required to generate 3D scene understanding. We propose a 3D mapping pipeline that uses only RGB images to create segmented point clouds of the target anatomy. To ensure the most precise reconstruction, we compare different structure from motion algorithms' performance on mapping the central airway obstructions, and test the pipeline on a downstream task of tumor resection. 
In several metrics, including post-procedure tissue model evaluation, our pipeline performs comparably to RGB-D cameras and, in some cases, even surpasses their performance. These promising results demonstrate that automation guidance can be achieved in minimally invasive procedures with monocular cameras. This study is a step toward the complete autonomy of surgical robots. </p> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2503.16340" title="Abstract" id="2503.16340"> arXiv:2503.16340 </a> (cross-list from cs.LG) [<a href="/pdf/2503.16340" title="Download PDF" id="pdf-2503.16340" aria-labelledby="pdf-2503.16340">pdf</a>, <a href="https://arxiv.org/html/2503.16340v1" title="View HTML" id="html-2503.16340" aria-labelledby="html-2503.16340" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16340" title="Other formats" id="oth-2503.16340" aria-labelledby="oth-2503.16340">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Nonlinear action prediction models reveal multi-timescale locomotor control </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wei-Chen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=De+Comite,+A">Antoine De Comite</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Daley,+M">Monica Daley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Voloshina,+A">Alexandra Voloshina</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seethapathi,+N">Nidhi Seethapathi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Robotics (cs.RO) </div> <p class='mathjax'> Modeling movement in real-world tasks is a fundamental scientific goal. However, it is unclear whether existing models and their assumptions, overwhelmingly tested in laboratory-constrained settings, generalize to the real world. For example, data-driven models of foot placement control -- a crucial action for stable locomotion -- assume linear and single timescale mappings. We develop nonlinear foot placement prediction models, finding that neural network architectures with flexible input history-dependence like GRU and Transformer perform best across multiple contexts (walking and running, treadmill and overground, varying terrains) and input modalities (multiple body states, gaze), outperforming traditional models. These models reveal context- and modality-dependent timescales: there is more reliance on fast-timescale predictions in complex terrain, gaze predictions precede body state predictions, and full-body state predictions precede center-of-mass-relevant predictions. Thus, nonlinear action prediction models provide quantifiable insights into real-world motor control and can be extended to other actions, contexts, and populations. 
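</p> <p class='mathjax'> The kind of history-dependent predictor referred to above can be sketched as a small GRU regressor that maps a window of body-state vectors to the next foot-placement coordinates (shapes, names, and dimensions below are illustrative, not the authors' architecture or data): </p> <pre>
import torch
import torch.nn as nn

class FootPlacementGRU(nn.Module):
    """Map a history of body-state vectors to the next foot placement (x, y)."""
    def __init__(self, state_dim=12, hidden_dim=64, out_dim=2):
        super().__init__()
        self.gru = nn.GRU(state_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, out_dim)

    def forward(self, states):               # states: (batch, history_len, state_dim)
        _, h_last = self.gru(states)          # h_last: (1, batch, hidden_dim)
        return self.head(h_last[-1])          # (batch, out_dim)

model = FootPlacementGRU()
history = torch.randn(8, 30, 12)              # e.g. 30 past frames of body states
pred = model(history)                         # predicted next foot placement, shape (8, 2)
loss = nn.functional.mse_loss(pred, torch.zeros(8, 2))
loss.backward()
</pre> <p class='mathjax'>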
</p> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2503.16394" title="Abstract" id="2503.16394"> arXiv:2503.16394 </a> (cross-list from cs.CV) [<a href="/pdf/2503.16394" title="Download PDF" id="pdf-2503.16394" aria-labelledby="pdf-2503.16394">pdf</a>, <a href="https://arxiv.org/html/2503.16394v1" title="View HTML" id="html-2503.16394" aria-labelledby="html-2503.16394" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16394" title="Other formats" id="oth-2503.16394" aria-labelledby="oth-2503.16394">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do Visual Imaginations Improve Vision-and-Language Navigation Agents? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Perincherry,+A">Akhil Perincherry</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krantz,+J">Jacob Krantz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Stefan Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Robotics (cs.RO) </div> <p class='mathjax'> Vision-and-Language Navigation (VLN) agents are tasked with navigating an unseen environment using natural language instructions. In this work, we study if visual representations of sub-goals implied by the instructions can serve as navigational cues and lead to increased navigation performance. To synthesize these visual representations or imaginations, we leverage a text-to-image diffusion model on landmark references contained in segmented instructions. These imaginations are provided to VLN agents as an added modality to act as landmark cues and an auxiliary loss is added to explicitly encourage relating these with their corresponding referring expressions. Our findings reveal an increase in success rate (SR) of around 1 point and up to 0.5 points in success scaled by inverse path length (SPL) across agents. These results suggest that the proposed approach reinforces visual understanding compared to relying on language instructions alone. Code and data for our work can be found at <a href="https://www.akhilperincherry.com/VLN-Imagine-website/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2503.16413" title="Abstract" id="2503.16413"> arXiv:2503.16413 </a> (cross-list from cs.CV) [<a href="/pdf/2503.16413" title="Download PDF" id="pdf-2503.16413" aria-labelledby="pdf-2503.16413">pdf</a>, <a href="https://arxiv.org/html/2503.16413v1" title="View HTML" id="html-2503.16413" aria-labelledby="html-2503.16413" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16413" title="Other formats" id="oth-2503.16413" aria-labelledby="oth-2503.16413">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> M3: 3D-Spatial MultiModal Memory </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+X">Xueyan Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuchen Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+R">Ri-Zhao Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+X">Xuanbin Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jianglong Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sifei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiaolong Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR2025 homepage: <a href="https://m3-spatial-memory.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> code: <a href="https://github.com/MaureenZOU/m3-spatial" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Robotics (cs.RO) </div> <p class='mathjax'> We present 3D Spatial MultiModal Memory (M3), a multimodal memory system designed to retain information about medium-sized static scenes through video sources for visual perception. By integrating 3D Gaussian Splatting techniques with foundation models, M3 builds a multimodal memory capable of rendering feature representations across granularities, encompassing a wide range of knowledge. In our exploration, we identify two key challenges in previous works on feature splatting: (1) computational constraints in storing high-dimensional features for each Gaussian primitive, and (2) misalignment or information loss between distilled features and foundation model features. To address these challenges, we propose M3 with key components of principal scene components and Gaussian memory attention, enabling efficient training and inference. To validate M3, we conduct comprehensive quantitative evaluations of feature similarity and downstream tasks, as well as qualitative visualizations to highlight the pixel trace of Gaussian memory attention. Our approach encompasses a diverse range of foundation models, including vision-language models (VLMs), perception models, and large multimodal and language models (LMMs/LLMs). Furthermore, to demonstrate real-world applicability, we deploy M3's feature field in indoor scenes on a quadruped robot. Notably, we claim that M3 is the first work to address the core compression challenges in 3D feature distillation. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 21 of 21 entries)</h3> <dt> <a name='item38'>[38]</a> <a href ="/abs/2401.16013" title="Abstract" id="2401.16013"> arXiv:2401.16013 </a> (replaced) [<a href="/pdf/2401.16013" title="Download PDF" id="pdf-2401.16013" aria-labelledby="pdf-2401.16013">pdf</a>, <a href="https://arxiv.org/html/2401.16013v4" title="View HTML" id="html-2401.16013" aria-labelledby="html-2401.16013" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2401.16013" title="Other formats" id="oth-2401.16013" aria-labelledby="oth-2401.16013">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SERL: A Software Suite for Sample-Efficient Robotic Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jianlan Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Z">Zheyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Charles Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Y+L">You Liang Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berg,+J">Jacob Berg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sharma,+A">Archit Sharma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schaal,+S">Stefan Schaal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Finn,+C">Chelsea Finn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+A">Abhishek Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sergey Levine</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICRA 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> In recent years, significant progress has been made in the field of robotic reinforcement learning (RL), enabling methods that handle complex image observations, train in the real world, and incorporate auxiliary data, such as demonstrations and prior experience. However, despite these advances, robotic RL remains hard to use. It is acknowledged among practitioners that the particular implementation details of these algorithms are often just as important (if not more so) for performance as the choice of algorithm. We posit that a significant challenge to widespread adoption of robotic RL, as well as further development of robotic RL methods, is the comparative inaccessibility of such methods. To address this challenge, we developed a carefully implemented library containing a sample efficient off-policy deep RL method, together with methods for computing rewards and resetting the environment, a high-quality controller for a widely-adopted robot, and a number of challenging example tasks. We provide this library as a resource for the community, describe its design choices, and present experimental results. Perhaps surprisingly, we find that our implementation can achieve very efficient learning, acquiring policies for PCB board assembly, cable routing, and object relocation between 25 to 50 minutes of training per policy on average, improving over state-of-the-art results reported for similar tasks in the literature. 
These policies achieve perfect or near-perfect success rates, extreme robustness even under perturbations, and exhibit emergent recovery and correction behaviors. We hope that these promising results and our high-quality open-source implementation will provide a tool for the robotics community to facilitate further developments in robotic RL. Our code, documentation, and videos can be found at <a href="https://serl-robot.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2403.10996" title="Abstract" id="2403.10996"> arXiv:2403.10996 </a> (replaced) [<a href="/pdf/2403.10996" title="Download PDF" id="pdf-2403.10996" aria-labelledby="pdf-2403.10996">pdf</a>, <a href="https://arxiv.org/html/2403.10996v5" title="View HTML" id="html-2403.10996" aria-labelledby="html-2403.10996" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.10996" title="Other formats" id="oth-2403.10996" aria-labelledby="oth-2403.10996">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixed-Reality Digital Twins: Leveraging the Physical and Virtual Worlds for Hybrid Sim2Real Transition of Multi-Agent Reinforcement Learning Policies </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Samak,+C+V">Chinmay Vilas Samak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Samak,+T+V">Tanmay Vilas Samak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Krovi,+V+N">Venkat Narayan Krovi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Multi-agent reinforcement learning (MARL) for cyber-physical vehicle systems usually requires a significantly long training time due to their inherent complexity. Furthermore, deploying the trained policies in the real world demands a feature-rich environment along with multiple physical embodied agents, which may not be feasible due to monetary, physical, energy, or safety constraints. This work seeks to address these pain points by presenting a mixed-reality digital twin framework capable of: (i) selectively scaling parallelized workloads on-demand, and (ii) evaluating the trained policies across simulation-to-reality (sim2real) experiments. The viability and performance of the proposed framework are highlighted through two representative use cases, which cover cooperative as well as competitive classes of MARL problems. We study the effect of: (i) agent and environment parallelization on training time, and (ii) systematic domain randomization on zero-shot sim2real transfer across both case studies. Results indicate up to 76.3% reduction in training time with the proposed parallelization scheme and sim2real gap as low as 2.9% using the proposed deployment method. 
</p> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2405.20031" title="Abstract" id="2405.20031"> arXiv:2405.20031 </a> (replaced) [<a href="/pdf/2405.20031" title="Download PDF" id="pdf-2405.20031" aria-labelledby="pdf-2405.20031">pdf</a>, <a href="https://arxiv.org/html/2405.20031v3" title="View HTML" id="html-2405.20031" aria-labelledby="html-2405.20031" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.20031" title="Other formats" id="oth-2405.20031" aria-labelledby="oth-2405.20031">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MG-SLAM: Structure Gaussian Splatting SLAM with Manhattan World Hypothesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shuhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+T">Tianchen Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Heng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liuzhuozheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Danwei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingrui Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Gaussian Splatting SLAMs have made significant advancements in improving the efficiency and fidelity of real-time reconstructions. However, these systems often encounter incomplete reconstructions in complex indoor environments, characterized by substantial holes due to unobserved geometry caused by obstacles or limited view angles. To address this challenge, we present Manhattan Gaussian SLAM, an RGB-D system that leverages the Manhattan World hypothesis to enhance geometric accuracy and completeness. By seamlessly integrating fused line segments derived from structured scenes, our method ensures robust tracking in textureless indoor areas. Moreover, the extracted lines and the planar-surface assumption allow strategic interpolation of new Gaussians in regions of missing geometry, enabling efficient scene completion. Extensive experiments conducted on both synthetic and real-world scenes demonstrate that these advancements enable our method to achieve state-of-the-art performance, marking a substantial improvement in the capabilities of Gaussian SLAM systems. 
</p> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2407.00507" title="Abstract" id="2407.00507"> arXiv:2407.00507 </a> (replaced) [<a href="/pdf/2407.00507" title="Download PDF" id="pdf-2407.00507" aria-labelledby="pdf-2407.00507">pdf</a>, <a href="https://arxiv.org/html/2407.00507v2" title="View HTML" id="html-2407.00507" aria-labelledby="html-2407.00507" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.00507" title="Other formats" id="oth-2407.00507" aria-labelledby="oth-2407.00507">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AVOCADO: Adaptive Optimal Collision Avoidance driven by Opinion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Martinez-Baselga,+D">Diego Martinez-Baselga</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sebasti%C3%A1n,+E">Eduardo Sebastián</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Montijano,+E">Eduardo Montijano</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Riazuelo,+L">Luis Riazuelo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sag%C3%BC%C3%A9s,+C">Carlos Sagüés</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Montano,+L">Luis Montano</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This paper is published in IEEE Transactions on Robotics under DOI <a href="https://doi.org/10.1109/TRO.2025.3552350" data-doi="10.1109/TRO.2025.3552350" class="link-https link-external" rel="external noopener nofollow">https://doi.org/10.1109/TRO.2025.3552350</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> We present AVOCADO (AdaptiVe Optimal Collision Avoidance Driven by Opinion), a novel navigation approach to address holonomic robot collision avoidance when the degree of cooperation of the other agents in the environment is unknown. AVOCADO builds on a Velocity Obstacle formulation akin to the Optimal Reciprocal Collision Avoidance method. However, instead of assuming reciprocity, AVOCADO poses an adaptive control problem that aims at adapting in real time to the cooperation degree of other robots and agents. Adaptation is achieved through a novel nonlinear opinion dynamics design that relies solely on sensor observations. As a by-product, based on the nonlinear opinion dynamics, we propose a novel method to avoid deadlocks under geometrical symmetries among robots and agents. Extensive numerical simulations show that AVOCADO surpasses existing geometrical, learning and planning-based approaches in mixed cooperative/non-cooperative navigation environments in terms of success rate, time to goal and computational time. In addition, we conduct multiple real experiments that verify that AVOCADO is able to avoid collisions in environments crowded with other robots and humans. 
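</p> <p class='mathjax'> To make the nonlinear-opinion-dynamics adaptation concrete, the following hedged sketch Euler-integrates one common form of such dynamics; the gains, the scalar observation term, and the mapping to a cooperation weight are assumptions for illustration, not AVOCADO's actual design. </p> <pre>
# Hedged sketch: one integration step of a nonlinear opinion state that adapts
# a cooperation estimate from sensed relative motion. All gains are assumptions.
import numpy as np

def opinion_step(z, obs, dt=0.05, d=1.0, u=2.0, alpha=1.2, gamma=1.0, bias=0.0):
    """Euler step of dz/dt = -d*z + u*tanh(alpha*z + gamma*obs) + bias."""
    z_dot = -d * z + u * np.tanh(alpha * z + gamma * obs) + bias
    return z + dt * z_dot

# A bounded cooperation weight can then be read off the opinion, e.g.
# cooperation = 0.5 * (1.0 + np.tanh(z))
</pre> <p class='mathjax'> The bounded opinion can then scale how much of the avoidance effort the robot takes on itself at each control cycle.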
</p> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2408.09612" title="Abstract" id="2408.09612"> arXiv:2408.09612 </a> (replaced) [<a href="/pdf/2408.09612" title="Download PDF" id="pdf-2408.09612" aria-labelledby="pdf-2408.09612">pdf</a>, <a href="https://arxiv.org/html/2408.09612v2" title="View HTML" id="html-2408.09612" aria-labelledby="html-2408.09612" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.09612" title="Other formats" id="oth-2408.09612" aria-labelledby="oth-2408.09612">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ContactSDF: Signed Distance Functions as Multi-Contact Models for Dexterous Manipulation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+W">Wen Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+W">Wanxin Jin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> In this paper, we propose ContactSDF, a method that uses signed distance functions (SDFs) to approximate multi-contact models, including both collision detection and time-stepping routines. ContactSDF first establishes an SDF using the supporting plane representation of an object for collision detection, and then uses the generated contact dual cones to build a second SDF for time-stepping prediction of the next state. These two SDFs create a differentiable and closed-form multi-contact dynamic model for state prediction, enabling efficient model learning and optimization for contact-rich manipulation. We perform extensive simulation experiments to show the effectiveness of ContactSDF for model learning and real-time control of dexterous manipulation. We further evaluate ContactSDF on a hardware Allegro hand for on-palm reorientation tasks. Results show that with around 2 minutes of learning on hardware, ContactSDF achieves high-quality dexterous manipulation at a frequency of 30-60 Hz. 
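</p> <p class='mathjax'> As a hedged illustration of a signed distance function built from a supporting-plane representation, the snippet below scores a point against a convex body described by outward unit plane normals and offsets; the variable names, the contact margin, and the max-over-planes surrogate are illustrative assumptions, not ContactSDF's closed-form model. </p> <pre>
# Hedged sketch: support-plane SDF surrogate for collision checking against a
# convex body {p : normals @ p <= offsets}. Exact inside, a lower bound outside.
import numpy as np

def support_plane_sdf(x, normals, offsets):
    """Surrogate signed distance of point x (3,) given unit normals (k,3) and offsets (k,)."""
    return float(np.max(normals @ x - offsets))

def in_contact(x, normals, offsets, margin=1e-3):
    # Treat the point as touching once the surrogate distance falls below the margin.
    return support_plane_sdf(x, normals, offsets) <= margin
</pre> <p class='mathjax'> A smooth variant of this max (for example a log-sum-exp) is what a differentiable, gradient-based pipeline would typically use instead.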
Project page <a href="https://yangwen-1102.github.io/contactsdf.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2409.09849" title="Abstract" id="2409.09849"> arXiv:2409.09849 </a> (replaced) [<a href="/pdf/2409.09849" title="Download PDF" id="pdf-2409.09849" aria-labelledby="pdf-2409.09849">pdf</a>, <a href="/format/2409.09849" title="Other formats" id="oth-2409.09849" aria-labelledby="oth-2409.09849">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dynamic Layer Detection of Thin Materials using DenseTact Optical Tactile Sensors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dhawan,+A+K">Ankush Kundan Dhawan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chungyoun,+C">Camille Chungyoun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ting,+K">Karina Ting</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kennedy,+M">Monroe Kennedy III</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 pages, 9 figures, submitted to IROS 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> Manipulation of thin materials is critical for many everyday tasks and remains a significant challenge for robots. While existing research has made strides in tasks like material smoothing and folding, many studies struggle with common failure modes (crumpled corners/edges, incorrect grasp configurations) that a preliminary step of layer detection can solve. We present a novel method for classifying the number of grasped material layers using a custom gripper equipped with DenseTact 2.0 optical tactile sensors. After grasping a thin material, the gripper performs an anthropomorphic rubbing motion while collecting optical flow, 6-axis wrench, and joint state data. Using this data in a transformer-based network achieves a test accuracy of 98.21% in correctly classifying the number of grasped cloth layers, and 81.25% accuracy in classifying layers of grasped paper, showing the effectiveness of our dynamic rubbing method. Evaluating different inputs and model architectures highlights the usefulness of tactile sensor information and a transformer model for this task. A comprehensive dataset of 568 labeled trials (368 for cloth and 200 for paper) was collected and made open-source along with this paper. Our project page is available at <a href="https://armlabstanford.github.io/dynamic-cloth-detection" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2410.03035" title="Abstract" id="2410.03035"> arXiv:2410.03035 </a> (replaced) [<a href="/pdf/2410.03035" title="Download PDF" id="pdf-2410.03035" aria-labelledby="pdf-2410.03035">pdf</a>, <a href="/format/2410.03035" title="Other formats" id="oth-2410.03035" aria-labelledby="oth-2410.03035">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SPINE: Online Semantic Planning for Missions with Incomplete Natural Language Specifications in Unstructured Environments </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ravichandran,+Z">Zachary Ravichandran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Murali,+V">Varun Murali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tzes,+M">Mariliza Tzes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pappas,+G+J">George J. Pappas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+V">Vijay Kumar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to the International Conference on Robotics and Automation (ICRA) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> As robots become increasingly capable, users will want to describe high-level missions and have robots infer the relevant details. Because pre-built maps are difficult to obtain in many realistic settings, accomplishing such missions will require the robot to map and plan online. While many semantic planning methods operate online, they are typically designed for well-specified missions such as object search or exploration. Recently, large language models (LLMs) have demonstrated powerful contextual reasoning abilities over a range of robotic tasks described in natural language. However, existing LLM-enabled planners typically do not consider online planning or complex missions; rather, relevant subtasks and semantics are provided by a pre-built map or a user. We address these limitations via SPINE, an online planner for missions with incomplete mission specifications provided in natural language. The planner uses an LLM to reason about subtasks implied by the mission specification and then realizes these subtasks in a receding-horizon framework. Tasks are automatically validated for safety and refined online with new map observations. We evaluate SPINE in simulation and real-world settings with missions that require multiple steps of semantic reasoning and exploration in cluttered outdoor environments of over 20,000 m$^2$. Compared to baselines that use existing LLM-enabled planning approaches, our method is over twice as efficient in terms of time and distance, requires fewer user interactions, and does not require a full map. Additional resources are provided at: <a href="https://zacravichandran.github.io/SPINE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2410.16591" title="Abstract" id="2410.16591"> arXiv:2410.16591 </a> (replaced) [<a href="/pdf/2410.16591" title="Download PDF" id="pdf-2410.16591" aria-labelledby="pdf-2410.16591">pdf</a>, <a href="https://arxiv.org/html/2410.16591v2" title="View HTML" id="html-2410.16591" aria-labelledby="html-2410.16591" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16591" title="Other formats" id="oth-2410.16591" aria-labelledby="oth-2410.16591">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cycloidal Quasi-Direct Drive Actuator Designs with Learning-based Torque Estimation for Legged Robotics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+A">Alvin Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tanaka,+Y">Yusuke Tanaka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rafeedi,+F">Fadi Rafeedi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+D">Dennis Hong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Proceeding to 2025 IEEE International Conference on Robotics and Automation (ICRA25) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> This paper presents a novel approach through the design and implementation of Cycloidal Quasi-Direct Drive actuators for legged robotics. The cycloidal gear mechanism, with its inherent high torque density and mechanical robustness, offers significant advantages over conventional designs. By integrating cycloidal gears into the Quasi-Direct Drive framework, we aim to enhance the performance of legged robots, particularly in tasks demanding high torque and dynamic loads, while still keeping them lightweight. Additionally, we develop a torque estimation framework for the actuator using an Actuator Network, which effectively reduces the sim-to-real gap introduced by the cycloidal drive's complex dynamics. This integration is crucial for capturing the complex dynamics of a cycloidal drive, which contributes to improved learning efficiency, agility, and adaptability for reinforcement learning. 
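</p> <p class='mathjax'> A hedged sketch of an actuator-network-style torque estimator is given below: a small MLP maps a short history of joint position errors and velocities to the drive's output torque. The layer sizes, activation, and three-sample history are assumptions for illustration, not the paper's architecture. </p> <pre>
# Hedged sketch: MLP torque estimator from a short history of joint signals.
import torch
import torch.nn as nn

class ActuatorNet(nn.Module):
    """Maps recent joint position errors and velocities to an estimated output torque."""
    def __init__(self, history=3, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * history, hidden), nn.Softsign(),
            nn.Linear(hidden, hidden), nn.Softsign(),
            nn.Linear(hidden, 1),            # estimated output torque
        )

    def forward(self, pos_err_hist, vel_hist):
        # pos_err_hist, vel_hist: (B, history) tensors of recent samples, newest last.
        return self.net(torch.cat([pos_err_hist, vel_hist], dim=-1))
</pre> <p class='mathjax'> Trained on logged hardware data, such a network can stand in for an idealized gearbox model inside the simulator used for reinforcement learning.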
</p> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2410.17524" title="Abstract" id="2410.17524"> arXiv:2410.17524 </a> (replaced) [<a href="/pdf/2410.17524" title="Download PDF" id="pdf-2410.17524" aria-labelledby="pdf-2410.17524">pdf</a>, <a href="https://arxiv.org/html/2410.17524v4" title="View HTML" id="html-2410.17524" aria-labelledby="html-2410.17524" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.17524" title="Other formats" id="oth-2410.17524" aria-labelledby="oth-2410.17524">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mechanisms and Computational Design of Multi-Modal End-Effector with Force Sensing using Gated Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tanaka,+Y">Yusuke Tanaka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+A">Alvin Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+R">Richard Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mehta,+A">Ankur Mehta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+D">Dennis Hong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Proceeding to 2025 IEEE International Conference on Robotics and Automation (ICRA25) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> In limbed robotics, end-effectors must serve dual functions, such as both feet for locomotion and grippers for grasping, which presents design challenges. This paper introduces a multi-modal end-effector capable of transitioning between flat and line foot configurations while providing grasping capabilities. MAGPIE integrates 8-axis force sensing using proposed mechanisms with hall effect sensors, enabling both contact and tactile force measurements. We present a computational design framework for our sensing mechanism that accounts for noise and interference, allowing for desired sensitivity and force ranges and generating ideal inverse models. The hardware implementation of MAGPIE is validated through experiments, demonstrating its capability as a foot and verifying the performance of the sensing mechanisms, ideal models, and gated network-based models. 
</p> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2410.21845" title="Abstract" id="2410.21845"> arXiv:2410.21845 </a> (replaced) [<a href="/pdf/2410.21845" title="Download PDF" id="pdf-2410.21845" aria-labelledby="pdf-2410.21845">pdf</a>, <a href="https://arxiv.org/html/2410.21845v3" title="View HTML" id="html-2410.21845" aria-labelledby="html-2410.21845" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21845" title="Other formats" id="oth-2410.21845" aria-labelledby="oth-2410.21845">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Precise and Dexterous Robotic Manipulation via Human-in-the-Loop Reinforcement Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jianlan Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Charles Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jeffrey Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sergey Levine</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Reinforcement learning (RL) holds great promise for enabling autonomous acquisition of complex robotic manipulation skills, but realizing this potential in real-world settings has been challenging. We present a human-in-the-loop vision-based RL system that demonstrates impressive performance on a diverse set of dexterous manipulation tasks, including dynamic manipulation, precision assembly, and dual-arm coordination. Our approach integrates demonstrations and human corrections, efficient RL algorithms, and other system-level design choices to learn policies that achieve near-perfect success rates and fast cycle times within just 1 to 2.5 hours of training. We show that our method significantly outperforms imitation learning baselines and prior RL approaches, with an average 2x improvement in success rate and 1.8x faster execution. Through extensive experiments and analysis, we provide insights into the effectiveness of our approach, demonstrating how it learns robust, adaptive policies for both reactive and predictive control strategies. Our results suggest that RL can indeed learn a wide range of complex vision-based manipulation policies directly in the real world within practical training times. We hope this work will inspire a new generation of learned robotic manipulation techniques, benefiting both industrial applications and research advancements. Videos and code are available at our project website <a href="https://hil-serl.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2412.03142" title="Abstract" id="2412.03142"> arXiv:2412.03142 </a> (replaced) [<a href="/pdf/2412.03142" title="Download PDF" id="pdf-2412.03142" aria-labelledby="pdf-2412.03142">pdf</a>, <a href="https://arxiv.org/html/2412.03142v2" title="View HTML" id="html-2412.03142" aria-labelledby="html-2412.03142" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.03142" title="Other formats" id="oth-2412.03142" aria-labelledby="oth-2412.03142">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AffordDP: Generalizable Diffusion Policy with Transferable Affordance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shijie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yihang Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yunao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+K">Kaizhen Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jiayuan Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jingyi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Ye Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingya Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span> </div> <p class='mathjax'> Diffusion-based policies have shown impressive performance in robotic manipulation tasks while struggling with out-of-domain distributions. Recent efforts attempted to enhance generalization by improving the visual feature encoding for diffusion policy. However, their generalization is typically limited to the same category with similar appearances. Our key insight is that leveraging affordances--manipulation priors that define "where" and "how" an agent interacts with an object--can substantially enhance generalization to entirely unseen object instances and categories. We introduce the Diffusion Policy with transferable Affordance (AffordDP), designed for generalizable manipulation across novel categories. AffordDP models affordances through 3D contact points and post-contact trajectories, capturing the essential static and dynamic information for complex tasks. The transferable affordance from in-domain data to unseen objects is achieved by estimating a 6D transformation matrix using foundational vision models and point cloud registration techniques. More importantly, we incorporate affordance guidance during diffusion sampling that can refine action sequence generation. This guidance directs the generated action to gradually move towards the desired manipulation for unseen objects while keeping the generated action within the manifold of action space. Experimental results from both simulated and real-world environments demonstrate that AffordDP consistently outperforms previous diffusion-based methods, successfully generalizing to unseen instances and categories where others fail. 
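</p> <p class='mathjax'> The guidance-during-sampling idea can be sketched, with heavy hedging, as a small gradient nudge applied after each reverse-diffusion step; the denoiser interface, the affordance cost, and the step size below are placeholders, not AffordDP's implementation. </p> <pre>
# Hedged sketch: one denoising step followed by a gradient nudge toward the
# affordance (e.g. distance of end-effector waypoints to a transferred contact point).
import torch

def guided_denoise_step(denoiser, actions_t, t, affordance_cost, guidance_scale=0.1):
    actions_prev = denoiser(actions_t, t)                # placeholder reverse-diffusion step
    actions_prev = actions_prev.detach().requires_grad_(True)
    cost = affordance_cost(actions_prev)                 # scalar guidance cost
    grad, = torch.autograd.grad(cost, actions_prev)
    return (actions_prev - guidance_scale * grad).detach()
</pre> <p class='mathjax'> Keeping the nudge small relative to the denoising update is what keeps the guided actions close to the learned action manifold.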
</p> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2412.04445" title="Abstract" id="2412.04445"> arXiv:2412.04445 </a> (replaced) [<a href="/pdf/2412.04445" title="Download PDF" id="pdf-2412.04445" aria-labelledby="pdf-2412.04445">pdf</a>, <a href="https://arxiv.org/html/2412.04445v2" title="View HTML" id="html-2412.04445" aria-labelledby="html-2412.04445" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.04445" title="Other formats" id="oth-2412.04445" aria-labelledby="oth-2412.04445">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Moto: Latent Motion Token as the Bridging Language for Learning Robot Manipulation from Videos </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Y">Yuying Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+W">Weiliang Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yizhuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Y">Yixiao Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+M">Mingyu Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+Y">Ying Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xihui Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project released at: <a href="https://chenyi99.github.io/moto/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> Update: Added content related to real-world robot experiments and learning from human videos </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent developments in Large Language Models pre-trained on extensive corpora have shown significant success in various natural language processing tasks with minimal fine-tuning. This success offers new promise for robotics, which has long been constrained by the high cost of action-labeled data. We ask: given the abundant video data containing interaction-related knowledge available as a rich "corpus", can a similar generative pre-training approach be effectively applied to enhance robot learning? The key challenge is to identify an effective representation for autoregressive pre-training that benefits robot manipulation tasks. Inspired by the way humans learn new skills through observing dynamic environments, we propose that effective robotic learning should emphasize motion-related knowledge, which is closely tied to low-level actions and is hardware-agnostic, facilitating the transfer of learned motions to actual robot actions. To this end, we introduce Moto, which converts video content into latent Motion Token sequences by a Latent Motion Tokenizer, learning a bridging "language" of motion from videos in an unsupervised manner. We pre-train Moto-GPT through motion token autoregression, enabling it to capture diverse visual motion knowledge. After pre-training, Moto-GPT demonstrates the promising ability to produce semantically interpretable motion tokens, predict plausible motion trajectories, and assess trajectory rationality through output likelihood. 
To transfer learned motion priors to real robot actions, we implement a co-fine-tuning strategy that seamlessly bridges latent motion token prediction and real robot control. Extensive experiments show that the fine-tuned Moto-GPT exhibits superior robustness and efficiency on robot manipulation benchmarks, underscoring its effectiveness in transferring knowledge from video data to downstream visual manipulation tasks. </p> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2503.07013" title="Abstract" id="2503.07013"> arXiv:2503.07013 </a> (replaced) [<a href="/pdf/2503.07013" title="Download PDF" id="pdf-2503.07013" aria-labelledby="pdf-2503.07013">pdf</a>, <a href="https://arxiv.org/html/2503.07013v2" title="View HTML" id="html-2503.07013" aria-labelledby="html-2503.07013" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.07013" title="Other formats" id="oth-2503.07013" aria-labelledby="oth-2503.07013">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning Nash Equilibrial Hamiltonian for Two-Player Collision-Avoiding Interactions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Das,+S">Siddharth Das</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Merry,+T">Tanner Merry</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenlong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Y">Yi Ren</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by 2025 ACC </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Computer Science and Game Theory (cs.GT); Machine Learning (cs.LG) </div> <p class='mathjax'> We consider the problem of learning Nash equilibrial policies for two-player risk-sensitive collision-avoiding interactions. Solving the Hamilton-Jacobi-Isaacs equations of such general-sum differential games in real time is an open challenge due to the discontinuity of equilibrium values on the state space. A common solution is to learn a neural network that approximates the equilibrium Hamiltonian for given system states and actions. The learning, however, is usually supervised and requires a large amount of sample equilibrium policies from different initial states in order to mitigate the risks of collisions. This paper claims two contributions towards more data-efficient learning of equilibrium policies: First, instead of computing Hamiltonian through a value network, we show that the equilibrium co-states have simple structures when collision avoidance dominates the agents' loss functions and system dynamics is linear, and therefore are more data-efficient to learn. Second, we introduce theory-driven active learning to guide data sampling, where the acquisition function measures the compliance of the predicted co-states to Pontryagin's Maximum Principle. On an uncontrolled intersection case, the proposed method leads to more generalizable approximation of the equilibrium policies, and in turn, lower collision probabilities, than the state-of-the-art under the same data acquisition budget. 
</p> </div> </dd> <dt> <a name='item51'>[51]</a> <a href ="/abs/2503.15370" title="Abstract" id="2503.15370"> arXiv:2503.15370 </a> (replaced) [<a href="/pdf/2503.15370" title="Download PDF" id="pdf-2503.15370" aria-labelledby="pdf-2503.15370">pdf</a>, <a href="/format/2503.15370" title="Other formats" id="oth-2503.15370" aria-labelledby="oth-2503.15370">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Tangles: Unpacking Extended Collision Experiences with Soma Trajectories </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Benford,+S">Steve Benford</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garrett,+R">Rachael Garrett</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Christine Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tennent,+P">Paul Tennent</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=N%C3%BA%C3%B1ez-Pacheco,+C">Claudia Núñez-Pacheco</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kucukyilmaz,+A">Ayse Kucukyilmaz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsaknaki,+V">Vasiliki Tsaknaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=H%C3%B6%C3%B6k,+K">Kristina Höök</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Caleb-Solly,+P">Praminda Caleb-Solly</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Marshall,+J">Joe Marshall</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schneiders,+E">Eike Schneiders</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Popova,+K">Kristina Popova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Afana,+J">Jude Afana</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 32 pages, 13 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> We reappraise the idea of colliding with robots, moving from a position that tries to avoid or mitigate collisions to one that considers them an important facet of human interaction. We report on a soma design workshop that explored how our bodies could collide with telepresence robots, mobility aids, and a quadruped robot. Based on our findings, we employed soma trajectories to analyse collisions as extended experiences that negotiate key transitions of consent, preparation, launch, contact, ripple, sting, untangle, debris and reflect. We then employed these ideas to analyse two collision experiences, an accidental collision between a person and a drone, and the deliberate design of a robot to play with cats, revealing how real-world collisions involve the complex and ongoing entanglement of soma trajectories. We discuss how viewing collisions as entangled trajectories, or tangles, can be used analytically, as a design approach, and as a lens to broach ethical complexity. 
</p> </div> </dd> <dt> <a name='item52'>[52]</a> <a href ="/abs/2310.17178" title="Abstract" id="2310.17178"> arXiv:2310.17178 </a> (replaced) [<a href="/pdf/2310.17178" title="Download PDF" id="pdf-2310.17178" aria-labelledby="pdf-2310.17178">pdf</a>, <a href="https://arxiv.org/html/2310.17178v2" title="View HTML" id="html-2310.17178" aria-labelledby="html-2310.17178" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.17178" title="Other formats" id="oth-2310.17178" aria-labelledby="oth-2310.17178">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Relational Object-Centric Actor-Critic </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ugadiarov,+L">Leonid Ugadiarov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vorobyov,+V">Vitaliy Vorobyov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Panov,+A+I">Aleksandr I. Panov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> The advances in unsupervised object-centric representation learning have significantly improved its application to downstream tasks. Recent works highlight that disentangled object representations can aid policy learning in image-based, object-centric reinforcement learning tasks. This paper proposes a novel object-centric reinforcement learning algorithm that integrates actor-critic and model-based approaches by incorporating an object-centric world model within the critic. The world model captures the environment's data-generating process by predicting the next state and reward given the current state-action pair, where actions are interventions in the environment. In model-based reinforcement learning, world model learning can be interpreted as a causal induction problem, where the agent must learn the causal relationships underlying the environment's dynamics. We evaluate our method in a simulated 3D robotic environment and a 2D environment with compositional structure. As baselines, we compare against object-centric, model-free actor-critic algorithms and a state-of-the-art monolithic model-based algorithm. While the baselines show comparable performance in easier tasks, our approach outperforms them in more challenging scenarios with a large number of objects or more complex dynamics. 
</p> </div> </dd> <dt> <a name='item53'>[53]</a> <a href ="/abs/2409.16502" title="Abstract" id="2409.16502"> arXiv:2409.16502 </a> (replaced) [<a href="/pdf/2409.16502" title="Download PDF" id="pdf-2409.16502" aria-labelledby="pdf-2409.16502">pdf</a>, <a href="https://arxiv.org/html/2409.16502v3" title="View HTML" id="html-2409.16502" aria-labelledby="html-2409.16502" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.16502" title="Other formats" id="oth-2409.16502" aria-labelledby="oth-2409.16502">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for Improved Visual Localization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sidorov,+G">Gennady Sidorov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohrat,+M">Malik Mohrat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gridusov,+D">Denis Gridusov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rakhimov,+R">Ruslan Rakhimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kolyubin,+S">Sergey Kolyubin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project website at <a href="https://gsplatloc.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> Although various visual localization approaches exist, such as scene coordinate regression and camera pose regression, these methods often struggle with optimization complexity or limited accuracy. To address these challenges, we explore the use of novel view synthesis techniques, particularly 3D Gaussian Splatting (3DGS), which enables the compact encoding of both 3D geometry and scene appearance. We propose a two-stage procedure that integrates dense and robust keypoint descriptors from the lightweight XFeat feature extractor into 3DGS, enhancing performance in both indoor and outdoor environments. The coarse pose estimates are directly obtained via 2D-3D correspondences between the 3DGS representation and query image descriptors. In the second stage, the initial pose estimate is refined by minimizing the rendering-based photometric warp loss. Benchmarking on widely used indoor and outdoor datasets demonstrates improvements over recent neural rendering-based localization methods, such as NeRFMatch and PNeRFLoc. 
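</p> <p class='mathjax'> In the spirit of the coarse stage described above, a hedged sketch of recovering a camera pose from 2D-3D keypoint correspondences with PnP and RANSAC is shown below; OpenCV is used for clarity, the reprojection threshold and variable names are assumptions, and the rendering-based photometric refinement stage is omitted. </p> <pre>
# Hedged sketch: coarse camera pose from matched 3D points and 2D keypoints.
import cv2

def coarse_pose(pts3d, pts2d, K):
    """pts3d: (N, 3) float32, pts2d: (N, 2) float32, K: 3x3 camera intrinsics."""
    ok, rvec, tvec, inliers = cv2.solvePnPRansac(
        pts3d, pts2d, K, distCoeffs=None, reprojectionError=3.0)
    if not ok:
        raise RuntimeError("PnP did not find a pose")
    R, _ = cv2.Rodrigues(rvec)            # world-to-camera rotation matrix
    return R, tvec, inliers
</pre> <p class='mathjax'> The returned pose would then seed the second-stage photometric refinement against the rendered scene.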
</p> </div> </dd> <dt> <a name='item54'>[54]</a> <a href ="/abs/2410.04260" title="Abstract" id="2410.04260"> arXiv:2410.04260 </a> (replaced) [<a href="/pdf/2410.04260" title="Download PDF" id="pdf-2410.04260" aria-labelledby="pdf-2410.04260">pdf</a>, <a href="https://arxiv.org/html/2410.04260v2" title="View HTML" id="html-2410.04260" aria-labelledby="html-2410.04260" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.04260" title="Other formats" id="oth-2410.04260" aria-labelledby="oth-2410.04260">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pareto Control Barrier Function for Inner Safe Set Maximization Under Input Constraints </div> <div class='list-authors'><a href="https://arxiv.org/search/math?searchtype=author&query=Cao,+X">Xiaoyang Cao</a>, <a href="https://arxiv.org/search/math?searchtype=author&query=Fu,+Z">Zhe Fu</a>, <a href="https://arxiv.org/search/math?searchtype=author&query=Bayen,+A+M">Alexandre M. Bayen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for presentation at American Control Conference 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Optimization and Control (math.OC)</span>; Artificial Intelligence (cs.AI); Robotics (cs.RO) </div> <p class='mathjax'> This article introduces the Pareto Control Barrier Function (PCBF) algorithm to maximize the inner safe set of dynamical systems under input constraints. Traditional Control Barrier Functions (CBFs) ensure safety by maintaining system trajectories within a safe set but often fail to account for realistic input constraints. To address this problem, we leverage the Pareto multi-task learning framework to balance competing objectives of safety and safe set volume. The PCBF algorithm is applicable to high-dimensional systems and is computationally efficient. We validate its effectiveness through comparison with Hamilton-Jacobi reachability for an inverted pendulum and through simulations on a 12-dimensional quadrotor system. Results show that the PCBF consistently outperforms existing methods, yielding larger safe sets and ensuring safety under input constraints. 
</p> </div> </dd> <dt> <a name='item55'>[55]</a> <a href ="/abs/2501.04004" title="Abstract" id="2501.04004"> arXiv:2501.04004 </a> (replaced) [<a href="/pdf/2501.04004" title="Download PDF" id="pdf-2501.04004" aria-labelledby="pdf-2501.04004">pdf</a>, <a href="https://arxiv.org/html/2501.04004v2" title="View HTML" id="html-2501.04004" aria-labelledby="html-2501.04004" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.04004" title="Other formats" id="oth-2501.04004" aria-labelledby="oth-2501.04004">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xiang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+L">Lingdong Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shuai,+H">Hui Shuai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Liang Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Ziwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qingshan Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CVPR 2025; 27 pages, 17 figures, 10 tables; Project Page at <a href="https://ldkong.com/LiMoE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Machine Learning (cs.LG); Robotics (cs.RO) </div> <p class='mathjax'> LiDAR data pretraining offers a promising approach to leveraging large-scale, readily available datasets for enhanced data utilization. However, existing methods predominantly focus on sparse voxel representation, overlooking the complementary attributes provided by other LiDAR representations. In this work, we propose LiMoE, a framework that integrates the Mixture of Experts (MoE) paradigm into LiDAR data representation learning to synergistically combine multiple representations, such as range images, sparse voxels, and raw points. Our approach consists of three stages: i) Image-to-LiDAR Pretraining, which transfers prior knowledge from images to point clouds across different representations; ii) Contrastive Mixture Learning (CML), which uses MoE to adaptively activate relevant attributes from each representation and distills these mixed features into a unified 3D network; iii) Semantic Mixture Supervision (SMS), which combines semantic logits from multiple representations to boost downstream segmentation performance. Extensive experiments across eleven large-scale LiDAR datasets demonstrate our effectiveness and superiority. The code has been made publicly accessible. 
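</p> <p class='mathjax'> As a hedged illustration of mixing features from several LiDAR representations with a learned gate, consider the minimal module below; the feature dimension, number of experts, and two-layer gate are assumptions for exposition rather than LiMoE's design. </p> <pre>
# Hedged sketch: softmax gate that blends per-point features from several
# representations (e.g. range image, sparse voxel, raw points).
import torch
import torch.nn as nn

class RepresentationMoE(nn.Module):
    def __init__(self, dim=128, num_experts=3):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(num_experts * dim, 64),
                                  nn.ReLU(),
                                  nn.Linear(64, num_experts))

    def forward(self, expert_feats):
        # expert_feats: (N, num_experts, dim) features per point per representation.
        weights = torch.softmax(self.gate(expert_feats.flatten(1)), dim=-1)  # (N, E)
        return (weights.unsqueeze(-1) * expert_feats).sum(dim=1)             # (N, dim)
</pre> <p class='mathjax'> The gate lets each point lean on whichever representation is most informative for it.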
</p> </div> </dd> <dt> <a name='item56'>[56]</a> <a href ="/abs/2501.13928" title="Abstract" id="2501.13928"> arXiv:2501.13928 </a> (replaced) [<a href="/pdf/2501.13928" title="Download PDF" id="pdf-2501.13928" aria-labelledby="pdf-2501.13928">pdf</a>, <a href="https://arxiv.org/html/2501.13928v2" title="View HTML" id="html-2501.13928" aria-labelledby="html-2501.13928" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13928" title="Other formats" id="oth-2501.13928" aria-labelledby="oth-2501.13928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jianing Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sax,+A">Alexander Sax</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+K+J">Kevin J. Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Henaff,+M">Mikael Henaff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+H">Hao Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+A">Ang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chai,+J">Joyce Chai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meier,+F">Franziska Meier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feiszli,+M">Matt Feiszli</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CVPR 2025. Project website: <a href="https://fast3r-3d.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Graphics (cs.GR); Robotics (cs.RO) </div> <p class='mathjax'> Multi-view 3D reconstruction remains a core challenge in computer vision, particularly in applications requiring accurate and scalable representations across diverse perspectives. Current leading methods such as DUSt3R employ a fundamentally pairwise approach, processing images in pairs and necessitating costly global alignment procedures to reconstruct from multiple views. In this work, we propose Fast 3D Reconstruction (Fast3R), a novel multi-view generalization to DUSt3R that achieves efficient and scalable 3D reconstruction by processing many views in parallel. Fast3R's Transformer-based architecture forwards N images in a single forward pass, bypassing the need for iterative alignment. Through extensive experiments on camera pose estimation and 3D reconstruction, Fast3R demonstrates state-of-the-art performance, with significant improvements in inference speed and reduced error accumulation. These results establish Fast3R as a robust alternative for multi-view applications, offering enhanced scalability without compromising reconstruction accuracy. 
</p> </div> </dd> <dt> <a name='item57'>[57]</a> <a href ="/abs/2503.10745" title="Abstract" id="2503.10745"> arXiv:2503.10745 </a> (replaced) [<a href="/pdf/2503.10745" title="Download PDF" id="pdf-2503.10745" aria-labelledby="pdf-2503.10745">pdf</a>, <a href="https://arxiv.org/html/2503.10745v2" title="View HTML" id="html-2503.10745" aria-labelledby="html-2503.10745" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.10745" title="Other formats" id="oth-2503.10745" aria-labelledby="oth-2503.10745">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unifying 2D and 3D Vision-Language Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jain,+A">Ayush Jain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Swerdlow,+A">Alexander Swerdlow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuzhou Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arnaud,+S">Sergio Arnaud</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martin,+A">Ada Martin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sax,+A">Alexander Sax</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meier,+F">Franziska Meier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fragkiadaki,+K">Katerina Fragkiadaki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Robotics (cs.RO) </div> <p class='mathjax'> Progress in 3D vision-language learning has been hindered by the scarcity of large-scale 3D datasets. We introduce UniVLG, a unified architecture for 2D and 3D vision-language understanding that bridges the gap between existing 2D-centric models and the rich 3D sensory data available in embodied systems. Our approach initializes most model weights from pre-trained 2D models and trains on both 2D and 3D vision-language data. We propose a novel language-conditioned mask decoder shared across 2D and 3D modalities to ground objects effectively in both RGB and RGB-D images, outperforming box-based approaches. To further reduce the domain gap between 2D and 3D, we incorporate 2D-to-3D lifting strategies, enabling UniVLG to utilize 2D data to enhance 3D performance. With these innovations, our model achieves state-of-the-art performance across multiple 3D vision-language grounding tasks, demonstrating the potential of transferring advances from 2D vision-language learning to the data-constrained 3D domain. Furthermore, co-training on both 2D and 3D data enhances performance across modalities without sacrificing 2D capabilities. By removing the reliance on 3D mesh reconstruction and ground-truth object proposals, UniVLG sets a new standard for realistic, embodied-aligned evaluation. Code and additional visualizations are available at <a href="https://univlg.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> . 
[58] arXiv:2503.13996 (replaced) [pdf, html, other]
Title: Robust Safety Critical Control Under Multiple State and Input Constraints: Volume Control Barrier Function Method
Authors: Jinyang Dong, Shizhen Wu, Rui Liu, Xiao Liang, Biao Lu, Yongchun Fang
Subjects: Systems and Control (eess.SY); Robotics (cs.RO)

In this paper, the safety-critical control problem for uncertain systems under multiple control barrier function (CBF) constraints and input constraints is investigated. A novel framework is proposed to generate a safety filter that minimizes changes to reference inputs when safety risks arise, ensuring a balance between safety and performance. A nonlinear disturbance observer (DOB) based on the robust integral of the sign of the error (RISE) is used to estimate system uncertainties, ensuring that the estimation error converges to zero exponentially. This error bound is integrated into the safety-critical controller to reduce conservativeness while ensuring safety. To further address the challenges arising from multiple CBF and input constraints, a novel Volume CBF (VCBF) is proposed by analyzing the feasible space of the quadratic programming (QP) problem, ensuring solution feasibility by keeping its volume positive. To ensure that the feasible space does not vanish under disturbances, a DOB-VCBF-based method is introduced, ensuring system safety while maintaining the feasibility of the resulting QP. Subsequently, several sets of simulation and experimental results are provided to validate the effectiveness of the proposed controller.
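For context, the "safety filter that minimizes changes to reference inputs" is the standard CBF quadratic program this paper builds on. The sketch below handles only a single affine CBF constraint without input bounds, where the QP reduces to a closed-form projection; the paper's contributions (multiple CBFs, input constraints, DOB-based disturbance bounds, and the Volume CBF) require a full QP solver and are not reproduced here. The example system and constants are made up for illustration.

# Generic CBF safety filter sketch (not the paper's DOB-VCBF method):
# choose the input closest to the reference u_ref subject to one affine
# CBF constraint a @ u + b >= 0. For a single constraint the QP solution
# is the Euclidean projection of u_ref onto that halfspace.
import numpy as np

def cbf_filter(u_ref, a, b):
    """Solve  min ||u - u_ref||^2  s.t.  a @ u + b >= 0  (single constraint)."""
    slack = a @ u_ref + b
    if slack >= 0:                        # reference input is already safe
        return u_ref
    return u_ref - slack * a / (a @ a)    # project onto the constraint boundary

# Example: single integrator x_dot = u with barrier h(x) = r^2 - ||x||^2 >= 0.
# The CBF condition h_dot + alpha*h >= 0 gives  (-2 x) @ u + alpha*h >= 0.
x, r, alpha = np.array([1.8, 0.0]), 2.0, 1.0
h = r**2 - x @ x
u_safe = cbf_filter(u_ref=np.array([1.0, 0.0]), a=-2 * x, b=alpha * h)
print(u_safe)  # x-velocity is reduced so the disc constraint is respected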
Total of 58 entries