
Search | arXiv e-print repository

Showing 1&ndash;50 of 703 results for author: Zhou, B
Searching in archive cs; sorted by announcement date (newest first), 50 results per page.

1. arXiv:2502.06703 [pdf, other] cs.CL
Can 1B LLM Surpass 405B LLM? Rethinking Compute-Optimal Test-Time Scaling
Authors: Runze Liu, Junqi Gao, Jian Zhao, Kaiyan Zhang, Xiu Li, Biqing Qi, Wanli Ouyang, Bowen Zhou
Abstract: Test-Time Scaling (TTS) is an important method for improving the performance of Large Language Models (LLMs) by using additional computation during the inference phase. However, current studies do not systematically analyze how policy models, Process Reward Models (PRMs), and problem difficulty influence TTS, which limits the understanding and practical use of TTS methods. In this paper, we focus on two core questions: (1) What is the optimal approach to scale test-time computation across different policy models, PRMs, and problem difficulty levels? (2) To what extent can extended computation improve the performance of LLMs on complex tasks, and can smaller language models outperform larger ones through this approach? Through comprehensive experiments on MATH-500 and the challenging AIME24 task, we make the following observations: (1) the compute-optimal TTS strategy is highly dependent on the choice of policy model, PRM, and problem difficulty; (2) with our compute-optimal TTS strategy, extremely small policy models can outperform larger ones. For example, a 1B LLM can exceed a 405B LLM on MATH-500. Moreover, on both MATH-500 and AIME24, a 0.5B LLM outperforms GPT-4o, a 3B LLM surpasses a 405B LLM, and a 7B LLM beats o1 and DeepSeek-R1, while achieving higher inference efficiency. These findings show the importance of adapting TTS strategies to the specific characteristics of each task and model, and indicate that TTS is a promising approach for enhancing the reasoning abilities of LLMs.
Submitted 10 February, 2025; originally announced February 2025.
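One canonical TTS strategy studied in this line of work is best-of-N sampling reranked by a Process Reward Model. A minimal sketch of that idea follows, with `generate` and `prm_score` as hypothetical stand-ins rather than interfaces from the paper:

```python
# Minimal sketch of PRM-scored best-of-N test-time scaling: sample N
# candidate solutions from a (small) policy model and keep the one the
# PRM scores highest. `generate` and `prm_score` are assumed interfaces.
from typing import Callable, List

def best_of_n(problem: str,
              generate: Callable[[str], str],          # policy: problem -> solution
              prm_score: Callable[[str, str], float],  # PRM: (problem, solution) -> score
              n: int = 16) -> str:
    candidates: List[str] = [generate(problem) for _ in range(n)]
    # Larger n spends more test-time compute; the paper's point is that the
    # best strategy and budget depend on the policy, the PRM, and difficulty.
    return max(candidates, key=lambda s: prm_score(problem, s))
```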
2. arXiv:2502.04696 [pdf, other] cs.RO
Adaptive Learning-based Model Predictive Control Strategy for Drift Vehicles
Authors: Bei Zhou, Cheng Hu, Jun Zeng, Zhouheng Li, Johannes Betz, Lei Xie, Hongye Su
Abstract: Drift vehicle control offers valuable insights to support safe autonomous driving in extreme conditions, which hinges on tracking a particular path while maintaining the vehicle states near the drift equilibrium points (DEP). However, conventional tracking methods are not adaptable to drift vehicles because of their opposed steering angle and yaw rate. In this paper, we propose an adaptive path tracking (APT) control method that dynamically adjusts drift states to follow the reference path, improving on commonly used predictive path-tracking methods with a reduced computation burden. Furthermore, existing control strategies require a precise system model to calculate the DEP, which can be intractable due to the highly nonlinear drift dynamics and sensitive vehicle parameters. To tackle this problem, an adaptive learning-based model predictive control (ALMPC) strategy is proposed based on the APT method, where an upper-level Bayesian optimization is employed to learn the DEP and the APT control law to instruct a lower-level MPC drift controller. This hierarchical system architecture also resolves the inherent control conflict between path tracking and drifting by separating these objectives into different layers. The ALMPC strategy is verified on the MATLAB-CarSim platform, and simulation results demonstrate its effectiveness in controlling the drift vehicle to follow a clothoid-based reference path even with a misidentified road friction parameter.
Submitted 7 February, 2025; originally announced February 2025.
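As a rough illustration of the hierarchy described above, the sketch below runs an upper-level search over the DEP parameter against a stubbed lower-level MPC rollout. Random search stands in for the paper's Bayesian optimization, and all names and constants are assumptions:

```python
# Hypothetical sketch of the two-layer structure: an upper level learns the
# drift equilibrium point (DEP) from closed-loop cost, while the lower-level
# MPC drift controller is abstracted as a black-box simulation stub.
import random

def simulate_mpc_tracking(dep_guess: float) -> float:
    """Stub: closed-loop path-tracking cost when the MPC assumes this DEP."""
    return (dep_guess - 0.42) ** 2 + random.gauss(0.0, 0.01)

def learn_dep(n_trials: int = 30) -> float:
    # Real Bayesian optimization would fit a surrogate (e.g. a Gaussian
    # process); random search here only shows the upper-level loop shape.
    best_dep, best_cost = 0.0, float("inf")
    for _ in range(n_trials):
        dep = random.uniform(-1.0, 1.0)   # candidate equilibrium parameter
        cost = simulate_mpc_tracking(dep)
        if cost < best_cost:
            best_dep, best_cost = dep, cost
    return best_dep
```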
3. arXiv:2502.03369 [pdf, other] cs.AI, cs.RO
Learning from Active Human Involvement through Proxy Value Propagation
Authors: Zhenghao Peng, Wenjie Mo, Chenda Duan, Quanyi Li, Bolei Zhou
Abstract: Learning from active human involvement enables the human subject to actively intervene and demonstrate to the AI agent during training. The interaction and corrective feedback from humans bring safety and AI alignment to the learning process. In this work, we propose a new reward-free active human involvement method called Proxy Value Propagation for policy optimization. Our key insight is that a proxy value function can be designed to express human intents: state-action pairs in the human demonstration are labeled with high values, while agent actions that trigger human intervention receive low values. Through the TD-learning framework, the labeled values of demonstrated state-action pairs are further propagated to other unlabeled data generated from the agents' exploration. The proxy value function thus induces a policy that faithfully emulates human behaviors. Human-in-the-loop experiments show the generality and efficiency of our method. With minimal modification to existing reinforcement learning algorithms, our method can learn to solve continuous and discrete control tasks with various human control devices, including the challenging task of driving in Grand Theft Auto V. Demo video and code are available at: https://metadriverse.github.io/pvp
Submitted 5 February, 2025; originally announced February 2025.
Comments: NeurIPS 2023 Spotlight. Project page: https://metadriverse.github.io/pvp
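A minimal sketch of the proxy-value labeling rule the abstract describes, with assumed tensor shapes and constants; the TD propagation to unlabeled exploration data would be the surrounding RL algorithm's usual loss:

```python
# Hedged sketch: human demonstrations are regressed to a high proxy value,
# intervened agent actions to a low one. Shapes and constants are assumptions.
import torch

HIGH, LOW = 1.0, -1.0  # assumed proxy-value constants

def proxy_value_loss(q_net, batch):
    """batch: tensors keyed by state, action, is_demo, is_intervened."""
    q = q_net(batch["state"]).gather(1, batch["action"].unsqueeze(1)).squeeze(1)
    target = torch.where(batch["is_demo"].bool(),
                         torch.full_like(q, HIGH),
                         torch.full_like(q, LOW))
    labeled = batch["is_demo"].bool() | batch["is_intervened"].bool()
    # Only human-labeled pairs get proxy targets; ordinary TD-learning on the
    # agent's own exploration data propagates these values everywhere else.
    return ((q - target) ** 2)[labeled].mean()
```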
4. arXiv:2502.01456 [pdf, other] cs.LG, cs.AI, cs.CL
Process Reinforcement through Implicit Rewards
Authors: Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, Jiarui Yuan, Huayu Chen, Kaiyan Zhang, Xingtai Lv, Shuo Wang, Yuan Yao, Xu Han, Hao Peng, Yu Cheng, Zhiyuan Liu, Maosong Sun, Bowen Zhou, Ning Ding
Abstract: Dense process rewards have proven a more effective alternative to sparse outcome-level rewards in the inference-time scaling of large language models (LLMs), particularly in tasks requiring complex multi-step reasoning. Dense rewards are also an appealing choice for the reinforcement learning (RL) of LLMs, since their fine-grained feedback has the potential to address some inherent issues of outcome rewards, such as training efficiency and credit assignment; yet this potential remains largely unrealized. This can be primarily attributed to the challenges of training process reward models (PRMs) online: collecting high-quality process labels is prohibitively expensive, making PRMs particularly vulnerable to reward hacking. To address these challenges, we propose PRIME (Process Reinforcement through IMplicit rEwards), which enables online PRM updates using only policy rollouts and outcome labels, through implicit process rewards. PRIME combines well with various advantage functions and forgoes the dedicated reward-model training phase that existing approaches require, substantially reducing the development overhead. We demonstrate PRIME's effectiveness on competition-level math and coding. Starting from Qwen2.5-Math-7B-Base, PRIME achieves a 15.1% average improvement across several key reasoning benchmarks over the SFT model. Notably, our resulting model, Eurus-2-7B-PRIME, surpasses Qwen2.5-Math-7B-Instruct on seven reasoning benchmarks with 10% of its training data.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 20 pages. Model, code, and data available at https://github.com/PRIME-RL/PRIME
5. arXiv:2502.00996 [pdf, other] cs.CL
Self-supervised Analogical Learning using Language Models
Authors: Ben Zhou, Sarthak Jain, Yi Zhang, Qiang Ning, Shuai Wang, Yassine Benajiba, Dan Roth
Abstract: Large language models have been shown to suffer from reasoning inconsistency issues: they fail more often in situations not well covered by the training data, even though exact or very similar reasoning paths exist in more common cases that they can successfully solve. Such observations motivate us to propose methods that encourage models to understand the high-level, abstract reasoning process during training instead of only the final answer. This way, models can transfer the exact solution to similar cases, regardless of their relevance to the pre-training data distribution. In this work, we propose SAL, a self-supervised analogical learning framework. SAL mimics the human analogy process and trains models to explicitly transfer high-quality symbolic solutions from cases that they know how to solve to other rare cases in which they tend to fail more. We show that the resulting models after SAL learning outperform base language models on a wide range of reasoning benchmarks, such as StrategyQA, GSM8K, and HotpotQA, by 2% to 20%. At the same time, we show through analytical studies that our model is more generalizable and controllable.
Submitted 2 February, 2025; originally announced February 2025.

6. arXiv:2502.00386 [pdf, other] cs.CV
Efficient Adaptive Label Refinement for Label Noise Learning
Authors: Wenzhen Zhang, Debo Cheng, Guangquan Lu, Bo Zhou, Jiaye Li, Shichao Zhang
Abstract: Deep neural networks are highly susceptible to overfitting noisy labels, which degrades performance. Existing methods address this issue with manually defined criteria that aim to achieve an optimal partition in each iteration, avoiding the noisy labels while thoroughly learning the clean samples; this often results in overly complex and difficult-to-train models. To address this issue, we decouple the tasks of avoiding incorrect labels and thoroughly learning clean samples, and propose a simple yet highly applicable method called Adaptive Label Refinement (ALR). First, inspired by label-refurbishment techniques, we update the original hard labels to soft labels using the model's predictions, reducing the risk of fitting incorrect labels. Then, by introducing an entropy loss, we gradually "harden" the high-confidence soft labels, guiding the model to better learn from clean samples. This approach is simple and efficient, requiring no prior knowledge of the noise or auxiliary datasets, which makes it more accessible than existing methods. We validate ALR's effectiveness through experiments on benchmark datasets with artificial label noise (CIFAR-10/100) and real-world datasets with inherent noise (ANIMAL-10N, Clothing1M, WebVision). The results show that ALR outperforms state-of-the-art methods.
Submitted 1 February, 2025; originally announced February 2025.
7. arXiv:2501.19318 [pdf, other] cs.AI
MINDSTORES: Memory-Informed Neural Decision Synthesis for Task-Oriented Reinforcement in Embodied Systems
Authors: Anirudh Chari, Suraj Reddy, Aditya Tiwari, Richard Lian, Brian Zhou
Abstract: While large language models (LLMs) have shown promising capabilities as zero-shot planners for embodied agents, their inability to learn from experience and build persistent mental models limits their robustness in complex open-world environments like Minecraft. We introduce MINDSTORES, an experience-augmented planning framework that enables embodied agents to build and leverage mental models through natural interaction with their environment. Drawing inspiration from how humans construct and refine cognitive mental models, our approach extends existing zero-shot LLM planning by maintaining a database of past experiences that informs future planning iterations. The key innovation is representing accumulated experiences as natural-language embeddings of (state, task, plan, outcome) tuples, which an LLM planner can efficiently retrieve and reason over to generate insights and guide plan refinement for novel states and tasks. Through extensive experiments in MineDojo, a simulation environment that provides low-level controls for Minecraft agents, we find that MINDSTORES learns and applies its knowledge significantly better than existing memory-based LLM planners while maintaining the flexibility and generalization benefits of zero-shot approaches, an important step toward more capable embodied AI systems that learn continuously through natural experience.
Submitted 31 January, 2025; originally announced January 2025.
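A minimal sketch of the experience database described above: embed (state, task, plan, outcome) tuples and retrieve nearest neighbors to condition the next plan. The `embed` function is an assumed stand-in for any text-embedding model:

```python
# Hedged sketch of the MINDSTORES-style experience store: cosine-similarity
# retrieval over embedded (state, task, plan, outcome) tuples.
import numpy as np
from typing import Callable, List, Tuple

Experience = Tuple[str, str, str, str]  # (state, task, plan, outcome)

class ExperienceStore:
    def __init__(self, embed: Callable[[str], np.ndarray]):
        self.embed, self.items, self.vecs = embed, [], []

    def add(self, exp: Experience) -> None:
        self.items.append(exp)
        self.vecs.append(self.embed(" | ".join(exp)))

    def retrieve(self, state: str, task: str, k: int = 5) -> List[Experience]:
        # Retrieved tuples would be placed in the LLM planner's prompt.
        q = self.embed(state + " | " + task)
        sims = [float(np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v)))
                for v in self.vecs]
        top = np.argsort(sims)[::-1][:k]
        return [self.items[i] for i in top]
```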
8. arXiv:2501.19278 [pdf, ps, other] cs.CL
Pheromone-based Learning of Optimal Reasoning Paths
Authors: Anirudh Chari, Aditya Tiwari, Richard Lian, Suraj Reddy, Brian Zhou
Abstract: Large Language Models (LLMs) have demonstrated remarkable reasoning capabilities through chain-of-thought prompting, yet discovering effective reasoning paths for complex problems remains challenging because the space of possible intermediate steps is vast. We introduce Ant Colony Optimization-guided Tree of Thought (ACO-ToT), a novel algorithm that combines ACO with LLMs to efficiently discover optimal reasoning paths for complex problems. Drawing inspiration from Hebbian learning in neurological systems, our method employs a collection of distinctly fine-tuned LLM "ants" that traverse and lay pheromone trails through a centralized tree of thought, with each ant's movement governed by a weighted combination of existing pheromone trails and its own specialized expertise. Complete reasoning paths are evaluated with a mixture-of-experts-based scoring function, and pheromones reinforce productive reasoning paths across iterations. Experiments on three challenging reasoning tasks (GSM8K, ARC-Challenge, and MATH) demonstrate that ACO-ToT performs significantly better than existing chain-of-thought optimization approaches, suggesting that incorporating biologically inspired collective-search mechanisms into LLM inference can substantially enhance reasoning capabilities.
Submitted 31 January, 2025; originally announced January 2025.
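A toy sketch of the ACO ingredient: next-node choice weighted by pheromone and heuristic strength, with evaporation and deposit each iteration. In the paper the "ants" are fine-tuned LLMs and the scorer is a mixture of experts; the chooser and constants below are assumptions:

```python
# Hedged sketch of the ant-colony mechanics over a tree of thoughts:
# pheromone^alpha * heuristic^beta edge selection, plus evaporate/deposit.
import random

def choose_next(children, pheromone, heuristic, alpha=1.0, beta=1.0):
    weights = [(pheromone[c] ** alpha) * (heuristic[c] ** beta) for c in children]
    return random.choices(children, weights=weights, k=1)[0]

def deposit(path, pheromone, score, evaporation=0.1):
    for node in pheromone:                 # evaporate everywhere
        pheromone[node] *= (1.0 - evaporation)
    for node in path:                      # reinforce the scored path
        pheromone[node] += score
```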
9. arXiv:2501.18362 [pdf, other] cs.AI, cs.CL, cs.CV, cs.LG
MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding
Authors: Yuxin Zuo, Shang Qu, Yifei Li, Zhangren Chen, Xuekai Zhu, Ermo Hua, Kaiyan Zhang, Ning Ding, Bowen Zhou
Abstract: We introduce MedXpertQA, a highly challenging and comprehensive benchmark for evaluating expert-level medical knowledge and advanced reasoning. MedXpertQA includes 4,460 questions spanning 17 specialties and 11 body systems, in two subsets: Text for text evaluation and MM for multimodal evaluation. Notably, MM introduces expert-level exam questions with diverse images and rich clinical information, including patient records and examination results, setting it apart from traditional medical multimodal benchmarks whose simple QA pairs are generated from image captions. MedXpertQA applies rigorous filtering and augmentation to address the insufficient difficulty of existing benchmarks like MedQA, and incorporates specialty-board questions to improve clinical relevance and comprehensiveness. We perform data synthesis to mitigate data-leakage risk and conduct multiple rounds of expert review to ensure accuracy and reliability. We evaluate 16 leading models on MedXpertQA. Moreover, medicine is deeply connected to real-world decision-making, providing a rich and representative setting for assessing reasoning abilities beyond mathematics and code; to this end, we develop a reasoning-oriented subset to facilitate the assessment of o1-like models.
Submitted 30 January, 2025; originally announced January 2025.

10. arXiv:2501.14336 [pdf, other] cs.DS, cs.DC. doi: 10.1145/3650200.3656596
RadiK: Scalable and Optimized GPU-Parallel Radix Top-K Selection
Authors: Yifei Li, Bole Zhou, Jiejing Zhang, Xuechao Wei, Yinghan Li, Yingda Chen
Abstract: Top-k selection, which identifies the largest or smallest k elements of a data set, is a fundamental operation in data-intensive domains such as databases and deep learning, so its scalability and efficiency are critical for these high-performance systems. However, previous efficient GPU implementations are mostly merge-based and rely heavily on the fast but size-limited on-chip memory, which restricts scalability by placing an upper bound on k. This work introduces a scalable and optimized GPU-parallel radix top-k selection that supports significantly larger k values than existing methods without compromising efficiency, regardless of input length and batch size. Our method incorporates a novel optimization framework tailored for high memory bandwidth and resource utilization, achieving up to 2.5x speedup over the prior art for non-batch queries and up to 4.8x speedup for batch queries. In addition, we propose an adaptive scaling technique that strengthens robustness and provides a further speedup of up to 2.7x on highly adversarial input distributions.
Submitted 24 January, 2025; originally announced January 2025.
Comments: Published at the 38th ACM International Conference on Supercomputing (ICS '24)
ACM Class: F.2.2
Journal ref: ICS '24: Proceedings of the 38th ACM International Conference on Supercomputing, 2024, pp. 537-548
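For orientation, here is a hedged CPU sketch of generic radix top-k selection, the technique the paper parallelizes and optimizes for GPUs; the actual kernels, batching, and adaptive scaling are far more involved:

```python
# Hedged sketch: select the k largest non-negative 32-bit integers by
# refining digit buckets from the most significant bits downward. Each
# round histograms the current candidates, keeps whole buckets that are
# certainly in the top-k, and recurses into the boundary bucket.
from typing import List

def radix_topk(values: List[int], k: int, bits: int = 8) -> List[int]:
    mask = (1 << bits) - 1
    selected, candidates = [], list(values)
    for shift in range(32 - bits, -1, -bits):
        if k == 0 or len(candidates) <= k:
            break
        hist = [0] * (1 << bits)
        for v in candidates:
            hist[(v >> shift) & mask] += 1
        for d in range(mask, -1, -1):          # walk buckets high -> low
            if hist[d] < k:
                # The whole bucket is certainly in the top-k.
                selected += [v for v in candidates if ((v >> shift) & mask) == d]
                k -= hist[d]
            else:
                # The k-th element lies in this bucket; refine it next round.
                candidates = [v for v in candidates if ((v >> shift) & mask) == d]
                break
    return selected + candidates[:k]
```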
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at the 38th ACM International Conference on Supercomputing (ICS &#39;24)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICS &#39;24: Proceedings of the 38th ACM International Conference on Supercomputing, 2024, Pages 537-548 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13876">arXiv:2501.13876</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13876">pdf</a>, <a href="https://arxiv.org/format/2501.13876">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> FAST-LIVO2 on Resource-Constrained Platforms: LiDAR-Inertial-Visual Odometry with Efficient Memory and Computation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bingyang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+C">Chunran Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziming Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+F">Fangcheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yixi Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13876v1-abstract-short" style="display: inline;"> This paper presents a lightweight LiDAR-inertial-visual odometry system optimized for resource-constrained platforms. It integrates a degeneration-aware adaptive visual frame selector into error-state iterated Kalman filter (ESIKF) with sequential updates, improving computation efficiency significantly while maintaining a similar level of robustness. Additionally, a memory-efficient mapping struct&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13876v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13876v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13876v1-abstract-full" style="display: none;"> This paper presents a lightweight LiDAR-inertial-visual odometry system optimized for resource-constrained platforms. It integrates a degeneration-aware adaptive visual frame selector into error-state iterated Kalman filter (ESIKF) with sequential updates, improving computation efficiency significantly while maintaining a similar level of robustness. Additionally, a memory-efficient mapping structure combining a locally unified visual-LiDAR map and a long-term visual map achieves a good trade-off between performance and memory usage. Extensive experiments on x86 and ARM platforms demonstrate the system&#39;s robustness and efficiency. On the Hilti dataset, our system achieves a 33% reduction in per-frame runtime and 47% lower memory usage compared to FAST-LIVO2, with only a 3 cm increase in RMSE. 
Despite this slight accuracy trade-off, our system remains competitive, outperforming state-of-the-art (SOTA) LIO methods such as FAST-LIO2 and most existing LIVO systems. These results validate the system's capability for scalable deployment on resource-constrained edge computing platforms.
Submitted 23 January, 2025; originally announced January 2025.

arXiv:2501.09167 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics)
Embodied Scene Understanding for Vision Language Models via MetaVQA
Authors: Weizhen Wang, Chenda Duan, Zhenghao Peng, Yuxin Liu, Bolei Zhou
Abstract: Vision Language Models (VLMs) demonstrate significant potential as embodied AI agents for various mobility applications. However, a standardized, closed-loop benchmark for evaluating their spatial reasoning and sequential decision-making capabilities is lacking. To address this, we present MetaVQA: a comprehensive benchmark designed to assess and enhance VLMs' understanding of spatial relationships and scene dynamics through Visual Question Answering (VQA) and closed-loop simulations.
MetaVQA leverages Set-of-Mark prompting and top-down view ground-truth annotations from the nuScenes and Waymo datasets to automatically generate extensive question-answer pairs based on diverse real-world traffic scenarios, ensuring object-centric and context-rich instructions. Our experiments show that fine-tuning VLMs with the MetaVQA dataset significantly improves their spatial reasoning and embodied scene comprehension in safety-critical simulations, evident not only in improved VQA accuracies but also in emerging safety-aware driving maneuvers. In addition, the learning demonstrates strong transferability from simulation to real-world observation. Code and data will be publicly available at https://metadriverse.github.io/metavqa .
Submitted 15 January, 2025; originally announced January 2025.
Comments: for the project webpage, see https://metadriverse.github.io/metavqa

arXiv:2501.07408 [pdf, other] | cs.AI (Artificial Intelligence)
Initial Findings on Sensor based Open Vocabulary Activity Recognition via Text Embedding Inversion
Authors: Lala Shakti Swarup Ray, Bo Zhou, Sungho Suh, Paul Lukowicz
Abstract: Conventional human activity recognition (HAR) relies on classifiers trained to predict discrete activity classes, inherently limiting recognition to activities explicitly present in the training set. Such classifiers invariably fail, assigning zero likelihood, when encountering unseen activities.
We propose Open Vocabulary HAR (OV-HAR), a framework that overcomes this limitation by first converting each activity into natural language and breaking it into a sequence of elementary motions. This descriptive text is then encoded into a fixed-size embedding. The model is trained to regress this embedding, which is subsequently decoded back into natural language using a pre-trained embedding inversion model. Unlike other works that rely on auto-regressive large language models (LLMs) at their core, OV-HAR achieves open vocabulary recognition without the computational overhead of such models. The generated text can be transformed into a single activity class using LLM prompt engineering. We have evaluated our approach on different modalities, including vision (pose), IMU, and pressure sensors, demonstrating robust generalization across unseen activities and modalities, offering a fundamentally different paradigm from contemporary classifiers.
Submitted 13 January, 2025; originally announced January 2025.
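As a rough illustration of the recipe above (regress a fixed-size text embedding from sensor data, then invert it back to language), here is a PyTorch sketch. `text_encoder` and `invert_embedding` are hypothetical stand-ins for a frozen sentence encoder and a pre-trained embedding-inversion model; none of this is OV-HAR's actual code.

```python
import torch
import torch.nn as nn

class SensorToEmbedding(nn.Module):
    """Regress a fixed-size text embedding directly from a flattened sensor window."""
    def __init__(self, in_dim, emb_dim=768):
        super().__init__()
        self.net = nn.Sequential(nn.Flatten(),
                                 nn.Linear(in_dim, 512), nn.ReLU(),
                                 nn.Linear(512, emb_dim))

    def forward(self, x):
        return self.net(x)

def training_step(model, text_encoder, sensors, descriptions, opt):
    # target: embedding of the activity described as elementary motions
    with torch.no_grad():
        target = text_encoder(descriptions)      # frozen encoder (assumed interface)
    loss = nn.functional.mse_loss(model(sensors), target)
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

# Inference (hypothetical): text = invert_embedding(model(sensor_window)),
# then an LLM prompt can collapse the free-form text to one activity class.
```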
arXiv:2501.06693 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics)
Vid2Sim: Realistic and Interactive Simulation from Video for Urban Navigation
Authors: Ziyang Xie, Zhizheng Liu, Zhenghao Peng, Wayne Wu, Bolei Zhou
Abstract: The sim-to-real gap has long posed a significant challenge for robot learning in simulation, preventing the deployment of learned models in the real world. Previous work has primarily focused on domain randomization and system identification to mitigate this gap. However, these methods are often limited by the inherent constraints of the simulation and graphics engines. In this work, we propose Vid2Sim, a novel framework that effectively bridges the sim2real gap through a scalable and cost-efficient real2sim pipeline for neural 3D scene reconstruction and simulation. Given a monocular video as input, Vid2Sim can generate photorealistic and physically interactable 3D simulation environments to enable the reinforcement learning of visual navigation agents in complex urban environments. Extensive experiments demonstrate that Vid2Sim significantly improves the performance of urban navigation in the digital twins and real world by 31.2% and 68.3% in success rate compared with agents trained with prior simulation methods.
Submitted 14 January, 2025; v1 submitted 11 January, 2025; originally announced January 2025.
Comments: Project page: https://metadriverse.github.io/vid2sim/

arXiv:2501.06566 [pdf, other] | cs.RO (Robotics); eess.SY (Systems and Control)
Cooperative Aerial Robot Inspection Challenge: A Benchmark for Heterogeneous Multi-UAV Planning and Lessons Learned
Authors: Muqing Cao, Thien-Minh Nguyen, Shenghai Yuan, Andreas Anastasiou, Angelos Zacharia, Savvas Papaioannou, Panayiotis Kolios, Christos G. Panayiotou, Marios M. Polycarpou, Xinhang Xu, Mingjie Zhang, Fei Gao, Boyu Zhou, Ben M. Chen, Lihua Xie
Abstract: We propose the Cooperative Aerial Robot Inspection Challenge (CARIC), a simulation-based benchmark for motion planning algorithms in heterogeneous multi-UAV systems. CARIC features UAV teams with complementary sensors, realistic constraints, and evaluation metrics prioritizing inspection quality and efficiency.
It offers a ready-to-use perception-control software stack and diverse scenarios to support the development and evaluation of task allocation and motion planning algorithms. Competitions using CARIC were held at IEEE CDC 2023 and the IROS 2024 Workshop on Multi-Robot Perception and Navigation, attracting innovative solutions from research teams worldwide. This paper examines the top three teams from CDC 2023, analyzing their exploration, inspection, and task allocation strategies while drawing insights into their performance across scenarios. The results highlight the task's complexity and suggest promising directions for future research in cooperative multi-UAV systems.
Submitted 14 January, 2025; v1 submitted 11 January, 2025; originally announced January 2025.
Comments: Please find our website at https://ntu-aris.github.io/caric

arXiv:2501.06474 [pdf, other] | eess.AS (Audio and Speech Processing); cs.SD (Sound)
The 1st SpeechWellness Challenge: Detecting Suicidal Risk Among Adolescents
Authors: Wen Wu, Ziyun Cui, Chang Lei, Yinan Duan, Diyang Qu, Ji Wu, Bowen Zhou, Runsen Chen, Chao Zhang
Abstract: The 1st SpeechWellness Challenge (SW1) aims to advance methods for detecting suicidal risk in adolescents using speech analysis techniques. Suicide among adolescents is a critical public health issue globally. Early detection of suicidal tendencies can lead to timely intervention and potentially save lives. Traditional methods of assessment often rely on self-reporting or clinical interviews, which may not always be accessible. The SW1 challenge addresses this gap by exploring speech as a non-invasive and readily available indicator of mental health. We release the SW1 dataset, which contains speech recordings from 600 adolescents aged 10-18 years. By focusing on speech generated from natural tasks, the challenge seeks to uncover patterns and markers that correlate with suicidal risk.
Submitted 11 January, 2025; originally announced January 2025.

arXiv:2501.03916 [pdf, other] | cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback
Authors: Jiakang Yuan, Xiangchao Yan, Botian Shi, Tao Chen, Wanli Ouyang, Bo Zhang, Lei Bai, Yu Qiao, Bowen Zhou
Abstract: The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation.
To move further towards the ultimate goal (i.e., automatic scientific research), in this paper we propose Dolphin, the first closed-loop, open-ended auto-research framework covering the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and use feedback from the experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers, ranked by topic and task attributes. Code is then automatically generated and debugged using an exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds them back into the next round of idea generation. Experiments are conducted on benchmark datasets spanning different topics, and the results show that Dolphin can generate novel ideas continuously and complete experiments in a loop. We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks, such as 2D image classification and 3D point classification.
Submitted 10 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
Comments: 19 pages, 11 figures, and our homepage: https://alpha-innovator.github.io/Dolphin-project-page
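The closed loop described above can be condensed into pseudocode. `llm`, `rank_papers`, and `run_experiment` below are hypothetical placeholders for Dolphin's components, not its actual API; this is a sketch of the control flow only.

```python
# Hypothetical sketch of a Dolphin-style closed research loop.
def research_loop(topic, papers, rounds=3, max_debug_tries=5):
    memory = []                                       # feedback carried across rounds
    for _ in range(rounds):
        refs = rank_papers(papers, topic)             # rank by topic/task attributes
        idea = llm(f"Propose a novel idea on {topic}.\n"
                   f"References: {refs}\nPast results: {memory}")
        code = llm(f"Write runnable experiment code for: {idea}")
        for _ in range(max_debug_tries):              # traceback-guided debugging
            ok, output = run_experiment(code)
            if ok:
                break
            code = llm(f"Fix the code given this traceback:\n{output}\n---\n{code}")
        memory.append(llm(f"Analyze these results and suggest next steps: {output}"))
    return memory
```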
arXiv:2501.02815 [pdf, other] | cs.RO (Robotics); eess.SY (Systems and Control)
Local Reactive Control for Mobile Manipulators with Whole-Body Safety in Complex Environments
Authors: Chunxin Zheng, Yulin Li, Zhiyuan Song, Zhihai Bi, Jinni Zhou, Boyu Zhou, Jun Ma
Abstract: Mobile manipulators typically encounter significant challenges in navigating narrow, cluttered environments due to their high-dimensional state spaces and complex kinematics. While reactive methods excel in dynamic settings, they struggle to efficiently incorporate complex, coupled constraints across the entire state space. In this work, we present a novel local reactive controller that reformulates the time-domain single-step problem into a multi-step optimization problem in the spatial domain, leveraging the propagation of a serial kinematic chain. This transformation facilitates the formulation of customized, decoupled link-specific constraints, which are further solved efficiently with augmented Lagrangian differential dynamic programming (AL-DDP). Our approach naturally absorbs spatial kinematic propagation in the forward pass and processes all link-specific constraints simultaneously during the backward pass, enhancing both constraint management and computational efficiency. Notably, in this framework we formulate collision avoidance constraints for each link using accurate geometric models with extracted free regions, and this improves the maneuverability of the mobile manipulator in narrow, cluttered spaces. Experimental results showcase significant improvements in safety, efficiency, and task completion rates. These findings underscore the robustness of the proposed method, particularly in narrow, cluttered environments where conventional approaches could falter. The open-source project can be found at https://github.com/Chunx1nZHENG/MM-with-Whole-Body-Safety-Release.git.
Submitted 6 January, 2025; originally announced January 2025.
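To make the constraint handling concrete, here is a generic augmented-Lagrangian loop for a single inequality constraint g(x) <= 0, with SciPy's unconstrained solver standing in for the DDP backward/forward passes. This sketches the general technique only; it is not the paper's AL-DDP solver, and the penalty parameter is arbitrary.

```python
import numpy as np
from scipy.optimize import minimize

def auglag(f, g, x0, rho=10.0, outer_iters=10):
    """Minimize f(x) subject to g(x) <= 0 via an augmented Lagrangian (PHR form)."""
    x, lam = np.asarray(x0, dtype=float), 0.0
    for _ in range(outer_iters):
        def merit(z):
            # shifted quadratic penalty for the inequality constraint
            return f(z) + (max(0.0, lam + rho * g(z)) ** 2 - lam ** 2) / (2.0 * rho)
        x = minimize(merit, x).x          # inner unconstrained solve
        lam = max(0.0, lam + rho * g(x))  # multiplier update
    return x, lam

# Example: minimize (x - 2)^2 subject to x <= 1; converges to x = 1, lam = 2.
x, lam = auglag(lambda z: (z[0] - 2.0) ** 2, lambda z: z[0] - 1.0, [0.0])
```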
arXiv:2501.02158 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
Joint Optimization for 4D Human-Scene Reconstruction in the Wild
Authors: Zhizheng Liu, Joe Lin, Wayne Wu, Bolei Zhou
Abstract: Reconstructing human motion and its surrounding environment is crucial for understanding human-scene interaction and predicting human movements in the scene. While much progress has been made in capturing human-scene interaction in constrained environments, prior methods can hardly reconstruct natural and diverse human motion and scene context from web videos. In this work, we propose JOSH, a novel optimization-based method for 4D human-scene reconstruction in the wild from monocular videos. JOSH uses techniques in both dense scene reconstruction and human mesh recovery as initialization, and then leverages human-scene contact constraints to jointly optimize the scene, the camera poses, and the human motion. Experiment results show that JOSH achieves better results on both global human motion estimation and dense scene reconstruction by jointly optimizing scene geometry and human motion. We further design a more efficient model, JOSH3R, and directly train it with pseudo-labels from web videos. JOSH3R outperforms other optimization-free methods despite being trained only with labels predicted by JOSH, further demonstrating its accuracy and generalization ability.
Submitted 3 January, 2025; originally announced January 2025.
Comments: Project Page: https://genforce.github.io/JOSH/

arXiv:2501.01990 [pdf, other] | cs.LG (Machine Learning); cs.DC (Distributed, Parallel, and Cluster Computing)
Towards Sustainable Large Language Model Serving
Authors: Sophia Nguyen, Beihao Zhou, Yi Ding, Sihang Liu
Abstract: In this work, we study LLMs from a carbon emission perspective, addressing both operational and embodied emissions, and paving the way for sustainable LLM serving. We characterize the performance and energy of LLaMA with 1B, 3B, and 7B parameters using two Nvidia GPU types, a latest-generation RTX6000 Ada and an older-generation T4. We analytically model operational carbon emissions based on energy consumption and carbon intensities from three grid regions (each representing a different energy source mix), and embodied carbon emissions based on chip area and memory size. Our characterization and modeling provide us with an in-depth understanding of the performance, energy, and carbon emissions of LLM serving.
Our findings highlight the potential for optimizing sustainable LLM serving systems by considering both operational and embodied carbon emissions simultaneously.
Submitted 30 December, 2024; originally announced January 2025.
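The accounting described above reduces to two simple terms, sketched below. The factors (grid intensity, per-area embodied carbon, lifetime) are made-up illustrative assumptions, not the paper's measurements.

```python
def operational_gco2(energy_kwh, grid_gco2_per_kwh):
    """Operational emissions: energy consumed times grid carbon intensity."""
    return energy_kwh * grid_gco2_per_kwh

def embodied_gco2(chip_area_mm2, kgco2_per_mm2, serving_hours, lifetime_hours):
    """Embodied emissions amortized over the share of device lifetime spent serving."""
    return chip_area_mm2 * kgco2_per_mm2 * 1000.0 * serving_hours / lifetime_hours

# Illustrative numbers: 1000 requests at 0.5 Wh each on a 400 gCO2/kWh grid,
# one hour of serving on a GPU with a 600 mm^2 die over a 5-year lifetime.
op = operational_gco2(1000 * 0.5e-3, 400.0)                      # 200 gCO2
emb = embodied_gco2(600.0, 0.03, serving_hours=1.0,
                    lifetime_hours=5 * 365 * 24)                 # ~0.4 gCO2
total = op + emb
```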
arXiv:2501.00432 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
OV-HHIR: Open Vocabulary Human Interaction Recognition Using Cross-modal Integration of Large Language Models
Authors: Lala Shakti Swarup Ray, Bo Zhou, Sungho Suh, Paul Lukowicz
Abstract: Understanding human-to-human interactions, especially in contexts like public security surveillance, is critical for monitoring and maintaining safety. Traditional activity recognition systems are limited by fixed vocabularies, predefined labels, and rigid interaction categories that often rely on choreographed videos and overlook concurrent interactive groups. These limitations make such systems less adaptable to real-world scenarios, where interactions are diverse and unpredictable. In this paper, we propose an open vocabulary human-to-human interaction recognition (OV-HHIR) framework that leverages large language models to generate open-ended textual descriptions of both seen and unseen human interactions in open-world settings without being confined to a fixed vocabulary. Additionally, we create a comprehensive, large-scale human-to-human interaction dataset by standardizing and combining existing public human interaction datasets into a unified benchmark. Extensive experiments demonstrate that our method outperforms traditional fixed-vocabulary classification systems and existing cross-modal language models for video understanding, setting the stage for more intelligent and adaptable visual understanding systems in surveillance and beyond.
Submitted 31 December, 2024; originally announced January 2025.
Comments: Accepted in IEEE ICASSP 2025

arXiv:2412.17739 [pdf, other] | cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Fourier Position Embedding: Enhancing Attention's Periodic Extension for Length Generalization
Authors: Ermo Hua, Che Jiang, Xingtai Lv, Kaiyan Zhang, Ning Ding, Youbang Sun, Biqing Qi, Yuchen Fan, Xuekai Zhu, Bowen Zhou
Abstract: Extending the context length of Language Models (LMs) by improving Rotary Position Embedding (RoPE) has become a trend. While existing works mainly address RoPE's limitations within the attention mechanism, this paper provides an analysis across nearly all parts of LMs, uncovering their adverse effects on length generalization for RoPE-based attention.
Using Discrete Signal Processing theory, we show that RoPE enables periodic attention by implicitly achieving a Non-Uniform Discrete Fourier Transform. However, this periodicity is undermined by spectral damage caused by: 1) linear layers and activation functions outside of attention; 2) insufficiently trained frequency components brought about by time-domain truncation. Building on our observations, we propose Fourier Position Embedding (FoPE), which enhances attention's frequency-domain properties to improve both its periodic extension and length generalization. FoPE constructs a Fourier series and zeroes out the destructive frequency components, increasing model robustness against spectrum damage. Experiments across various model scales show that, within varying context windows, FoPE can maintain a more stable perplexity and a more consistent accuracy in a needle-in-a-haystack task compared to RoPE and ALiBi. Several analyses and ablations lend further support to our method and theoretical modeling.
Submitted 2 January, 2025; v1 submitted 23 December, 2024; originally announced December 2024.
Comments: 14 pages, 7 figures
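As a loose illustration of that recipe, the sketch below builds the standard RoPE frequency spectrum and zeroes out components too slow to complete a full period within the training window. The cutoff criterion and parameter values are our assumptions, not the paper's exact construction.

```python
import torch

def rope_frequencies(head_dim, base=10000.0):
    """Standard RoPE spectrum: one angular frequency per pair of dimensions."""
    return base ** (-torch.arange(0, head_dim, 2).float() / head_dim)

def undertrained_mask(freqs, train_len):
    # A component with angular frequency w sweeps w * train_len radians during
    # training; drop those that never complete a full period (assumed criterion).
    return (freqs * train_len >= 2.0 * torch.pi).float()

freqs = rope_frequencies(head_dim=128)
fope_like = freqs * undertrained_mask(freqs, train_len=4096)  # destructive components zeroed
```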
arXiv:2412.16373 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CY (Computers and Society)
FairREAD: Re-fusing Demographic Attributes after Disentanglement for Fair Medical Image Classification
Authors: Yicheng Gao, Jinkui Hao, Bo Zhou
Abstract: Recent advancements in deep learning have shown transformative potential in medical imaging, yet concerns about fairness persist due to performance disparities across demographic subgroups. Existing methods aim to address these biases by mitigating sensitive attributes in image data; however, these attributes often carry clinically relevant information, and their removal can compromise model performance, a highly undesirable outcome. To address this challenge, we propose Fair Re-fusion After Disentanglement (FairREAD), a novel, simple, and efficient framework that mitigates unfairness by re-integrating sensitive demographic attributes into fair image representations. FairREAD employs orthogonality constraints and adversarial training to disentangle demographic information while using a controlled re-fusion mechanism to preserve clinically relevant details. Additionally, subgroup-specific threshold adjustments ensure equitable performance across demographic groups. Comprehensive evaluations on a large-scale clinical X-ray dataset demonstrate that FairREAD significantly reduces unfairness metrics while maintaining diagnostic accuracy, establishing a new benchmark for fairness and performance in medical image classification.
Submitted 20 December, 2024; originally announced December 2024.
Comments: Submitted to Medical Image Analysis, code will be available after review is complete
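A toy rendering of the two ingredients named above, an orthogonality penalty for disentanglement and a re-fusion step, under our own simplifications; the loss form and layer shapes are illustrative, not the paper's architecture.

```python
import torch
import torch.nn.functional as F

def orthogonality_loss(z_clinical, z_demo):
    """Drive the clinical representation orthogonal to the demographic one."""
    return F.cosine_similarity(z_clinical, z_demo, dim=-1).pow(2).mean()

def refuse(z_clinical, z_demo, fusion_layer):
    """Controlled re-fusion: concatenate and project, e.g. nn.Linear(2 * d, d)."""
    return fusion_layer(torch.cat([z_clinical, z_demo], dim=-1))
```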
arXiv:2412.16212 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
ManiVideo: Generating Hand-Object Manipulation Video with Dexterous and Generalizable Grasping
Authors: Youxin Pang, Ruizhi Shao, Jiajun Zhang, Hanzhang Tu, Yun Liu, Boyao Zhou, Hongwen Zhang, Yebin Liu
Abstract: In this paper, we introduce ManiVideo, a novel method for generating consistent and temporally coherent bimanual hand-object manipulation videos from given motion sequences of hands and objects. The core idea of ManiVideo is the construction of a multi-layer occlusion (MLO) representation that learns 3D occlusion relationships from occlusion-free normal maps and occlusion confidence maps. By embedding the MLO structure into the UNet in two forms, the model enhances the 3D consistency of dexterous hand-object manipulation.
To further achieve generalizable grasping of objects, we integrate Objaverse, a large-scale 3D object dataset, to address the scarcity of video data, thereby facilitating the learning of extensive object consistency. Additionally, we propose an innovative training strategy that effectively integrates multiple datasets, supporting downstream tasks such as human-centric hand-object manipulation video generation. Through extensive experiments, we demonstrate that our approach not only achieves video generation with plausible hand-object interaction and generalizable objects, but also outperforms existing SOTA methods.
Submitted 17 December, 2024; originally announced December 2024.

arXiv:2412.15496 [pdf, other] | cs.LG (Machine Learning); stat.ML (Machine Learning)
Understanding When and Why Graph Attention Mechanisms Work via Node Classification
Authors: Zhongtian Ma, Qiaosheng Zhang, Bocheng Zhou, Yexin Zhang, Shuyue Hu, Zhen Wang
Abstract: Despite the growing popularity of graph attention mechanisms, their theoretical understanding remains limited.
This paper explores the conditions under which these mechanisms are effective in node classification tasks through the lens of Contextual Stochastic Block Models (CSBMs). Our theoretical analysis reveals that incorporating graph attention mechanisms is not universally beneficial. Specifically, by appropriately defining structure noise and feature noise in graphs, we show that graph attention mechanisms can enhance classification performance when structure noise exceeds feature noise. Conversely, when feature noise predominates, simpler graph convolution operations are more effective. Furthermore, we examine the over-smoothing phenomenon and show that, in the high signal-to-noise ratio (SNR) regime, graph convolutional networks suffer from over-smoothing, whereas graph attention mechanisms can effectively resolve this issue. Building on these insights, we propose a novel multi-layer Graph Attention Network (GAT) architecture that significantly outperforms single-layer GATs in achieving perfect node classification in CSBMs, relaxing the SNR requirement from $\omega(\sqrt{\log n})$ to $\omega(\sqrt{\log n} / \sqrt[3]{n})$. To our knowledge, this is the first study to delineate the conditions for perfect node classification using multi-layer GATs. Our theoretical contributions are corroborated by extensive experiments on both synthetic and real-world datasets, highlighting the practical implications of our findings.
Submitted 19 December, 2024; originally announced December 2024.
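To make the CSBM setting concrete, here is a minimal sampler. The scalar class mean and all parameter values are illustrative simplifications of the model analyzed in the paper.

```python
import numpy as np

def sample_csbm(n, p, q, mu, sigma, d):
    """Contextual SBM: community structure in edges plus Gaussian node features."""
    y = np.random.choice([-1, 1], size=n)              # two balanced classes
    same = np.equal.outer(y, y)                        # intra- vs inter-class pairs
    A = np.random.rand(n, n) < np.where(same, p, q)    # edges w.p. p (intra) or q (inter)
    A = np.triu(A, 1)
    A = A | A.T                                        # undirected, no self-loops
    X = y[:, None] * mu + sigma * np.random.randn(n, d)  # feature SNR ~ mu / sigma
    return A, X, y

A, X, y = sample_csbm(n=500, p=0.05, q=0.01, mu=0.5, sigma=1.0, d=16)
```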
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xuekai Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+D">Daixuan Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hengli Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+E">Ermo Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+X">Xingtai Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhouhan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zilong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14689v1-abstract-short" style="display: inline;"> Model collapse in synthetic data indicates that iterative training on self-generated data leads to a gradual decline in performance. With the proliferation of AI models, synthetic data will fundamentally reshape the web data ecosystem. Future GPT-$\{n\}$ models will inevitably be trained on a blend of synthetic and human-produced data. In this paper, we focus on two questions: what is the impact o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14689v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14689v1-abstract-full" style="display: none;"> Model collapse in synthetic data indicates that iterative training on self-generated data leads to a gradual decline in performance. With the proliferation of AI models, synthetic data will fundamentally reshape the web data ecosystem. Future GPT-$\{n\}$ models will inevitably be trained on a blend of synthetic and human-produced data. In this paper, we focus on two questions: what is the impact of synthetic data on language model training, and how to synthesize data without model collapse? We first pre-train language models across different proportions of synthetic data, revealing a negative correlation between the proportion of synthetic data and model performance. We further conduct statistical analysis on synthetic data to uncover the distributional shift phenomenon and the over-concentration of n-gram features. Inspired by the above findings, we propose token editing on human-produced data to obtain semi-synthetic data. As a proof of concept, we theoretically demonstrate that token-level editing can prevent model collapse, as the test error is constrained by a finite upper bound. We conduct extensive experiments on pre-training from scratch, continual pre-training, and supervised fine-tuning. The results validate our theoretical proof that token-level editing improves data quality and enhances model performance. 
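<p>The token-level editing recipe described in this abstract can be made concrete with a short sketch. The following is a minimal illustration, not the authors' released code: the confidence threshold, the resampling rule, and the model/tokenizer handles are all assumptions.</p> <pre><code class="language-python">
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def token_edit(text, model, tokenizer, p_threshold=0.99):
    """Turn human-written text into semi-synthetic text by resampling
    only the tokens on which the model is overly confident."""
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        probs = model(ids).logits.softmax(dim=-1)
    edited = ids.clone()
    for t in range(1, ids.size(1)):
        p = probs[0, t - 1]  # distribution over the token at position t
        if p[ids[0, t]].item() > p_threshold:  # assumed editing rule
            edited[0, t] = torch.multinomial(p, 1)  # resample that token
    return tokenizer.decode(edited[0], skip_special_tokens=True)
</code></pre> <p>Because most tokens are copied verbatim, the edited corpus stays anchored to the human data distribution, which is the property the abstract's bounded-test-error argument relies on.</p>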
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14689v1-abstract-full').style.display = 'none'; document.getElementById('2412.14689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14186">arXiv:2412.14186</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14186">pdf</a>, <a href="https://arxiv.org/format/2412.14186">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards AI-$45^{\circ}$ Law: A Roadmap to Trustworthy AGI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chaochao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yingchun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14186v2-abstract-short" style="display: inline;"> Ensuring Artificial General Intelligence (AGI) reliably avoids harmful behaviors is a critical challenge, especially for systems with high autonomy or in safety-critical domains. Despite various safety assurance proposals and extreme risk warnings, comprehensive guidelines balancing AI safety and capability remain lacking. In this position paper, we propose the \textit{AI-\textbf{$45^{\circ}$} Law&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14186v2-abstract-full').style.display = 'inline'; document.getElementById('2412.14186v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14186v2-abstract-full" style="display: none;"> Ensuring Artificial General Intelligence (AGI) reliably avoids harmful behaviors is a critical challenge, especially for systems with high autonomy or in safety-critical domains. Despite various safety assurance proposals and extreme risk warnings, comprehensive guidelines balancing AI safety and capability remain lacking. In this position paper, we propose the \textit{AI-\textbf{$45^{\circ}$} Law} as a guiding principle for a balanced roadmap toward trustworthy AGI, and introduce the \textit{Causal Ladder of Trustworthy AGI} as a practical framework. This framework provides a systematic taxonomy and hierarchical structure for current AI capability and safety research, inspired by Judea Pearl&#39;s ``Ladder of Causation&#39;&#39;. 
The Causal Ladder comprises three core layers: the Approximate Alignment Layer, the Intervenable Layer, and the Reflectable Layer. These layers address the key challenges of safety and trustworthiness in AGI and contemporary AI systems. Building upon this framework, we define five levels of trustworthy AGI: perception, reasoning, decision-making, autonomy, and collaboration trustworthiness. These levels represent distinct yet progressive aspects of trustworthy AGI. Finally, we present a series of potential governance measures to support the development of trustworthy AGI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14186v2-abstract-full').style.display = 'none'; document.getElementById('2412.14186v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13912">arXiv:2412.13912</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13912">pdf</a>, <a href="https://arxiv.org/format/2412.13912">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Energy-Efficient SLAM via Joint Design of Sensing, Communication, and Exploration Speed </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+Z">Zidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+R">Ruibo Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bingpeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qinyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Y">Yi Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13912v1-abstract-short" style="display: inline;"> To support future spatial machine intelligence applications, lifelong simultaneous localization and mapping (SLAM) has drawn significant attention. SLAM is usually realized with various types of mobile robots performing simultaneous and continuous sensing and communication. This paper focuses on analyzing the energy efficiency of robot operation for lifelong SLAM by jointly considering sensin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13912v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13912v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13912v1-abstract-full" style="display: none;"> To support future spatial machine intelligence applications, lifelong simultaneous localization and mapping (SLAM) has drawn significant attention. 
SLAM is usually realized with various types of mobile robots performing simultaneous and continuous sensing and communication. This paper focuses on analyzing the energy efficiency of robot operation for lifelong SLAM by jointly considering sensing, communication and mechanical factors. The system model is built around a robot equipped with a 2D light detection and ranging (LiDAR) sensor and odometry. The raw point cloud data as well as the odometry data are wirelessly transmitted to a data center, where real-time map reconstruction is realized with an unsupervised deep-learning-based method. The sensing duration, transmit power, transmit duration and exploration speed are jointly optimized to minimize the energy consumption. Simulations and experiments demonstrate the performance of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13912v1-abstract-full').style.display = 'none'; document.getElementById('2412.13912v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09849">arXiv:2412.09849</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09849">pdf</a>, <a href="https://arxiv.org/ps/2412.09849">ps</a>, <a href="https://arxiv.org/format/2412.09849">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning for Spectrum Prediction in Cognitive Radio Networks: State-of-the-Art, New Opportunities, and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+G">Guangliang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yau%2C+D+K+Y">David K. Y. Yau</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qihui Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09849v1-abstract-short" style="display: inline;"> Spectrum prediction is considered to be a promising technology that enhances spectrum efficiency by assisting dynamic spectrum access (DSA) in cognitive radio networks (CRN). Nonetheless, the highly nonlinear nature of spectrum data across time, frequency, and space domains, coupled with the intricate spectrum usage patterns, poses challenges for accurate spectrum prediction. 
Deep learning (DL), r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09849v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09849v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09849v1-abstract-full" style="display: none;"> Spectrum prediction is considered to be a promising technology that enhances spectrum efficiency by assisting dynamic spectrum access (DSA) in cognitive radio networks (CRN). Nonetheless, the highly nonlinear nature of spectrum data across time, frequency, and space domains, coupled with the intricate spectrum usage patterns, poses challenges for accurate spectrum prediction. Deep learning (DL), recognized for its capacity to extract nonlinear features, has been applied to solve these challenges. This paper first shows the advantages of applying DL by comparing it with traditional prediction methods. Then, the current state-of-the-art DL-based spectrum prediction techniques are reviewed and summarized in terms of intra-band and cross-band prediction. Notably, this paper uses a real-world spectrum dataset to demonstrate the advances of DL-based methods. Then, this paper proposes a novel intra-band spatiotemporal spectrum prediction framework named ViTransLSTM. This framework integrates visual self-attention and long short-term memory to capture both local and global long-term spatiotemporal dependencies of spectrum usage patterns. Similarly, the effectiveness of the proposed framework is validated on the aforementioned real-world dataset. Finally, the paper presents new related challenges and potential opportunities for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09849v1-abstract-full').style.display = 'none'; document.getElementById('2412.09849v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
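<p>As a rough illustration of the ViTransLSTM idea summarized above (visual self-attention for local structure, an LSTM for long-term temporal dependencies), here is a minimal PyTorch sketch; all layer sizes and the exact composition are assumptions, not the paper's architecture.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class TinySpectrumPredictor(nn.Module):
    """Attend across frequency bins within each frame, then run an LSTM
    over frames to predict the next spectrum frame."""
    def __init__(self, n_freq=64, d_model=128):
        super().__init__()
        self.embed = nn.Linear(1, d_model)  # per-frequency-bin embedding
        self.attn = nn.MultiheadAttention(d_model, num_heads=4, batch_first=True)
        self.lstm = nn.LSTM(d_model, d_model, batch_first=True)
        self.head = nn.Linear(d_model, n_freq)

    def forward(self, x):  # x: (batch, time, n_freq)
        b, t, f = x.shape
        tokens = self.embed(x.reshape(b * t, f, 1))
        attended, _ = self.attn(tokens, tokens, tokens)  # self-attention per frame
        frames = attended.mean(dim=1).reshape(b, t, -1)
        out, _ = self.lstm(frames)  # temporal dependencies across frames
        return self.head(out[:, -1])  # next-frame spectrum estimate
</code></pre>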
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09037">arXiv:2412.09037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09037">pdf</a>, <a href="https://arxiv.org/format/2412.09037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Confusion: A Fine-grained Dialectical Examination of Human Activity Recognition Benchmark Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Geissler%2C+D">Daniel Geissler</a>, <a href="/search/cs?searchtype=author&amp;query=Nshimyimana%2C+D">Dominique Nshimyimana</a>, <a href="/search/cs?searchtype=author&amp;query=Rey%2C+V+F">Vitor Fortes Rey</a>, <a href="/search/cs?searchtype=author&amp;query=Suh%2C+S">Sungho Suh</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lukowicz%2C+P">Paul Lukowicz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09037v1-abstract-short" style="display: inline;"> Research on machine learning (ML) algorithms for human activity recognition (HAR) has made significant progress with publicly available datasets. However, most research prioritizes statistical metrics over examining negative sample details. While recent models like transformers have been applied to HAR datasets with limited success on the benchmark metrics, their counterparts have effectivel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09037v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09037v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09037v1-abstract-full" style="display: none;"> Research on machine learning (ML) algorithms for human activity recognition (HAR) has made significant progress with publicly available datasets. However, most research prioritizes statistical metrics over examining negative sample details. While recent models like transformers have been applied to HAR datasets with limited success on the benchmark metrics, their counterparts have effectively solved comparable problems with near 100% accuracy. This raises questions about the limitations of current approaches. This paper aims to address these open questions by conducting a fine-grained inspection of six popular HAR benchmark datasets. We identified parts of the data that none of the six chosen state-of-the-art ML methods can correctly classify, denoted as the intersect of false classifications (IFC). Analysis of the IFC reveals several underlying problems, including ambiguous annotations, irregularities during recording execution, and misaligned transition periods. We contribute to the field by quantifying and characterizing annotated data ambiguities, providing a trinary categorization mask for dataset patching, and stressing potential improvements for future data collections. 
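<p>The "intersect of false classifications" (IFC) used above has a direct set-theoretic reading: the samples that every evaluated model gets wrong. A small sketch, with array names assumed for illustration:</p> <pre><code class="language-python">
import numpy as np

def intersect_of_false_classifications(y_true, predictions_per_model):
    """y_true: (n,) ground-truth labels; predictions_per_model: one (n,)
    predicted-label array per model. Returns indices misclassified by all."""
    wrong = [np.not_equal(pred, y_true) for pred in predictions_per_model]
    return np.flatnonzero(np.logical_and.reduce(wrong))

# e.g., with six models' predictions p1..p6 on one HAR dataset:
# ifc = intersect_of_false_classifications(y_true, [p1, p2, p3, p4, p5, p6])
</code></pre>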
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09037v1-abstract-full').style.display = 'none'; document.getElementById('2412.09037v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08526">arXiv:2412.08526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08526">pdf</a>, <a href="https://arxiv.org/format/2412.08526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Spend More to Save More (SM2): An Energy-Aware Implementation of Successive Halving for Sustainable Hyperparameter Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Geissler%2C+D">Daniel Geissler</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Suh%2C+S">Sungho Suh</a>, <a href="/search/cs?searchtype=author&amp;query=Lukowicz%2C+P">Paul Lukowicz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08526v1-abstract-short" style="display: inline;"> A fundamental step in the development of machine learning models commonly involves the tuning of hyperparameters, often leading to multiple model training runs to work out the best-performing configuration. As machine learning tasks and models grow in complexity, there is an escalating need for solutions that not only improve performance but also address sustainability concerns. Existing strategie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08526v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08526v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08526v1-abstract-full" style="display: none;"> A fundamental step in the development of machine learning models commonly involves the tuning of hyperparameters, often leading to multiple model training runs to work out the best-performing configuration. As machine learning tasks and models grow in complexity, there is an escalating need for solutions that not only improve performance but also address sustainability concerns. Existing strategies predominantly focus on maximizing the performance of the model without considering energy efficiency. To bridge this gap, in this paper, we introduce Spend More to Save More (SM2), an energy-aware hyperparameter optimization implementation based on the widely adopted successive halving algorithm. Unlike conventional approaches including energy-intensive testing of individual hyperparameter configurations, SM2 employs exploratory pretraining to identify inefficient configurations with minimal energy expenditure. 
Incorporating hardware characteristics and real-time energy consumption tracking, SM2 identifies an optimal configuration that not only maximizes the performance of the model but also enables energy-efficient training. Experimental validations across various datasets, models, and hardware setups confirm the efficacy of SM2 in preventing wasted energy during the training of hyperparameter configurations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08526v1-abstract-full').style.display = 'none'; document.getElementById('2412.08526v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08515">arXiv:2412.08515</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08515">pdf</a>, <a href="https://arxiv.org/format/2412.08515">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Interpretability Through Loss-Defined Classification Objective in Structured Latent Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Geissler%2C+D">Daniel Geissler</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Mengxi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lukowicz%2C+P">Paul Lukowicz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08515v1-abstract-short" style="display: inline;"> Supervised machine learning often operates on the data-driven paradigm, wherein internal model parameters are autonomously optimized to converge predicted outputs with the ground truth, without explicitly programmed rules or a priori assumptions. Although data-driven methods have yielded notable successes across various benchmark datasets, they inherently treat models as opaque entities, thereb&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08515v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08515v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08515v1-abstract-full" style="display: none;"> Supervised machine learning often operates on the data-driven paradigm, wherein internal model parameters are autonomously optimized to converge predicted outputs with the ground truth, without explicitly programmed rules or a priori assumptions. Although data-driven methods have yielded notable successes across various benchmark datasets, they inherently treat models as opaque entities, thereby limiting their interpretability and yielding a lack of explanatory insights into their decision-making processes. 
In this work, we introduce Latent Boost, a novel approach that integrates advanced distance metric learning into supervised classification tasks, enhancing both interpretability and training efficiency. Thus, during training, the model is not only optimized for classification metrics of the discrete data points but also adheres to the rule that the collective representation zones of each class should be sharply clustered. By leveraging the rich structural insights of intermediate model layer latent representations, Latent Boost improves classification interpretability, as demonstrated by higher Silhouette scores, while accelerating training convergence. These performance and latent structural benefits are achieved with minimal additional cost, making it broadly applicable across various datasets without requiring data-specific adjustments. Furthermore, Latent Boost introduces a new paradigm for aligning classification performance with improved model transparency to address the challenges of black-box models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08515v1-abstract-full').style.display = 'none'; document.getElementById('2412.08515v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08507">arXiv:2412.08507</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08507">pdf</a>, <a href="https://arxiv.org/format/2412.08507">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Strategies and Challenges of Efficient White-Box Training for Human Activity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Geissler%2C+D">Daniel Geissler</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lukowicz%2C+P">Paul Lukowicz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08507v1-abstract-short" style="display: inline;"> Human Activity Recognition using time-series data from wearable sensors poses unique challenges due to complex temporal dependencies, sensor noise, placement variability, and diverse human behaviors. These factors, combined with the nontransparent nature of black-box Machine Learning models, impede interpretability and hinder human comprehension of model behavior. 
This paper addresses these challen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08507v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08507v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08507v1-abstract-full" style="display: none;"> Human Activity Recognition using time-series data from wearable sensors poses unique challenges due to complex temporal dependencies, sensor noise, placement variability, and diverse human behaviors. These factors, combined with the nontransparent nature of black-box Machine Learning models, impede interpretability and hinder human comprehension of model behavior. This paper addresses these challenges by exploring strategies to enhance interpretability through white-box approaches, which provide actionable insights into latent space dynamics and model behavior during training. By leveraging human intuition and expertise, the proposed framework improves explainability, fosters trust, and promotes transparent Human Activity Recognition systems. A key contribution is the proposal of a Human-in-the-Loop framework that enables dynamic user interaction with models, facilitating iterative refinements to enhance performance and efficiency. Additionally, we investigate the usefulness of Large Language Models as assistants that provide users with guidance for interpreting visualizations, diagnosing issues, and optimizing workflows. Together, these contributions present a scalable and efficient framework for developing interpretable and accessible Human Activity Recognition systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08507v1-abstract-full').style.display = 'none'; document.getElementById('2412.08507v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07779">arXiv:2412.07779</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07779">pdf</a>, <a href="https://arxiv.org/format/2412.07779">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evolution of Thought: Diverse and High-Quality Reasoning via Multi-Objective Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+B">Biqing Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Z">Zhouyi Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yiang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Junqi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07779v1-abstract-short" style="display: inline;"> As multi-modal large language models (MLLMs) are increasingly applied to complex reasoning tasks, the diversity and quality of reasoning paths become crucial factors affecting their performance. Although current methods aim to enhance reasoning quality through path expansion, they often neglect the diversity of reasoning paths and effective information sharing, leading to local optima and ineffici&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07779v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07779v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07779v1-abstract-full" style="display: none;"> As multi-modal large language models (MLLMs) are increasingly applied to complex reasoning tasks, the diversity and quality of reasoning paths become crucial factors affecting their performance. Although current methods aim to enhance reasoning quality through path expansion, they often neglect the diversity of reasoning paths and effective information sharing, leading to local optima and inefficiency. To address these challenges, we propose Evolution of Thought (EoT), a multi-objective framework designed to improve reasoning by fostering both high-quality and diverse reasoning paths. Specifically, we introduce the Non-dominated Sorting Genetic Algorithm II for multi-objective optimization, utilizing crossover and mutation operators to promote greater diversity in reasoning solutions. Additionally, we propose a Condensation-Aggregation mechanism to cluster and eliminate redundant paths, facilitate improved information sharing among parent nodes, and ultimately enhance both the efficiency and quality of the reasoning process. Validation experiments on various vision-language and language reasoning tasks demonstrate that EoT achieves superior reasoning performance and efficiency compared to other competitive baselines. 
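<p>To ground the multi-objective selection step described in this abstract: with per-path quality and diversity scores in hand (both assumed given here), non-dominated sorting keeps exactly the candidates that no other path beats on both objectives. A minimal Pareto-front sketch, not the authors' NSGA-II implementation:</p> <pre><code class="language-python">
def dominates(a, b):
    # a dominates b if it is at least as good on both objectives
    # and strictly better on at least one
    return a[0] >= b[0] and a[1] >= b[1] and (a[0] > b[0] or a[1] > b[1])

def pareto_front(paths, scores):
    """paths: candidate reasoning paths; scores: (quality, diversity)
    tuples, higher is better for both objectives."""
    front = []
    for i, s in enumerate(scores):
        if not any(dominates(t, s) for j, t in enumerate(scores) if j != i):
            front.append(paths[i])
    return front
</code></pre> <p>NSGA-II builds on this primitive, layering crossover, mutation, and crowding-distance selection on top of repeated non-dominated sorts.</p>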
Our study provides a novel perspective on the design of heuristic reasoning frameworks for MLLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07779v1-abstract-full').style.display = 'none'; document.getElementById('2412.07779v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03767">arXiv:2412.03767</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.03767">pdf</a>, <a href="https://arxiv.org/format/2412.03767">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Hyper: Hyperparameter Robust Efficient Exploration in Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yiran Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chenshu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yunfan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Amani%2C+S">Sanae Amani</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bolei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L+F">Lin F. Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03767v1-abstract-short" style="display: inline;"> The exploration \&amp; exploitation dilemma poses significant challenges in reinforcement learning (RL). Recently, curiosity-based exploration methods achieved great success in tackling hard-exploration problems. However, they necessitate extensive hyperparameter tuning on different environments, which heavily limits the applicability and accessibility of this line of methods. In this paper, we charac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03767v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03767v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03767v1-abstract-full" style="display: none;"> The exploration \&amp; exploitation dilemma poses significant challenges in reinforcement learning (RL). Recently, curiosity-based exploration methods achieved great success in tackling hard-exploration problems. However, they necessitate extensive hyperparameter tuning on different environments, which heavily limits the applicability and accessibility of this line of methods. In this paper, we characterize this problem via analysis of the agent behavior, concluding that choosing a proper hyperparameter is fundamentally difficult. We then identify the difficulty and instability of the optimization process when the agent learns with curiosity. 
We propose our method, hyperparameter robust exploration (\textbf{Hyper}), which substantially mitigates the problem by effectively regularizing the exploration visitation and decoupling the exploitation to ensure stable training. We theoretically justify that \textbf{Hyper} is provably efficient under the function approximation setting and empirically demonstrate its appealing performance and robustness in various environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03767v1-abstract-full').style.display = 'none'; document.getElementById('2412.03767v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:1907.05388 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01981">arXiv:2412.01981</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01981">pdf</a>, <a href="https://arxiv.org/format/2412.01981">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Free Process Rewards without Process Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+L">Lifan Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wendi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huayu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+G">Ganqu Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+H">Hao Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01981v1-abstract-short" style="display: inline;"> Different from its counterpart outcome reward models (ORMs), which evaluate the entire responses, a process reward model (PRM) scores a reasoning trajectory step by step, providing denser and more fine-grained rewards. However, training a PRM requires labels annotated at every intermediate step, presenting significant challenges for both manual and automatic data collection. 
This paper aims to add&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01981v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01981v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01981v1-abstract-full" style="display: none;"> Different from its counterpart outcome reward models (ORMs), which evaluate the entire responses, a process reward model (PRM) scores a reasoning trajectory step by step, providing denser and more fine-grained rewards. However, training a PRM requires labels annotated at every intermediate step, presenting significant challenges for both manual and automatic data collection. This paper aims to address this challenge. Both theoretically and empirically, we show that an \textit{implicit PRM} can be obtained at no additional cost, by simply training an ORM on the cheaper response-level labels. The only assumption is to parameterize the outcome reward as the log-likelihood ratios of the policy and reference models, which can be optimized regardless of the specific choice of loss objectives. In experiments, we instantiate our implicit PRMs with various objectives and evaluate their performance on MATH. We show that our implicit PRM outperforms a strong MCTS-based baseline \textit{á la} Math-Shepherd using less than $1/38$ of the training data. Its performance can be further improved with majority voting. We further find that scaling up instructions and responses benefits our implicit PRM, and the latter brings a larger gain. Particularly, we find that our implicit PRM, when instantiated with the cross-entropy (CE) loss, is more data-efficient and can keep improving generation models even when trained with only one response per instruction, the setup that suffers from extreme data scarcity and imbalance. Further, instructions should be relevant to downstream tasks while the diversity of responses does not bring gains. Surprisingly, training on extra Math-Shepherd step labels brings no further improvements to our implicit PRM trained on only outcome data. We hope that our work will encourage a rethinking of PRM training approaches and contribute to making training PRMs more accessible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01981v1-abstract-full').style.display = 'none'; document.getElementById('2412.01981v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
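<p>The central construction of this abstract can be sketched in a few lines. Assuming, as stated above, that the outcome reward is parameterized as the log-likelihood ratio of the policy and reference models, per-step process rewards fall out as differences of the cumulative ratio at step boundaries; the tensor names below are placeholders, not the paper's code.</p> <pre><code class="language-python">
import torch

def step_rewards(logp_policy, logp_ref, step_ends, beta=1.0):
    """logp_policy, logp_ref: (T,) per-token log-probs of one response under
    the policy and the reference model; step_ends: end index of each step."""
    ratio = beta * (logp_policy - logp_ref).cumsum(dim=0)  # cumulative reward
    rewards, prev = [], 0.0
    for t in step_ends:
        rewards.append(ratio[t].item() - prev)  # credit assigned to this step
        prev = ratio[t].item()
    return rewards
</code></pre> <p>Under this parameterization the ORM is trained only on response-level labels, yet the intermediate cumulative ratios already define step scores, which is why the PRM comes "for free".</p>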
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Models and data are available at: https://github.com/lifan-yuan/ImplicitPRM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01812">arXiv:2412.01812</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01812">pdf</a>, <a href="https://arxiv.org/format/2412.01812">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> V2XPnP: Vehicle-to-Everything Spatio-Temporal Fusion for Multi-Agent Perception and Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zewei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+H">Hao Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zhaoliang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S+Z">Seth Z. Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+M">Mingyue Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+T">Tianhui Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Johnson Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Bajji%2C+M">Maheswari Bajji</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+J">Jacob Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xin Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhiyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bolei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jiaqi Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01812v1-abstract-short" style="display: inline;"> Vehicle-to-everything (V2X) technologies offer a promising paradigm to mitigate the limitations of constrained observability in single-vehicle systems. Prior work primarily focuses on single-frame cooperative perception, which fuses agents&#39; information across different spatial locations but ignores temporal cues and temporal tasks (e.g., temporal perception and prediction). In this paper, we focus&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01812v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01812v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01812v1-abstract-full" style="display: none;"> Vehicle-to-everything (V2X) technologies offer a promising paradigm to mitigate the limitations of constrained observability in single-vehicle systems. Prior work primarily focuses on single-frame cooperative perception, which fuses agents&#39; information across different spatial locations but ignores temporal cues and temporal tasks (e.g., temporal perception and prediction). 
In this paper, we focus on temporal perception and prediction tasks in V2X scenarios and design one-step and multi-step communication strategies (when to transmit) as well as examine their integration with three fusion strategies - early, late, and intermediate (what to transmit), providing comprehensive benchmarks with various fusion models (how to fuse). Furthermore, we propose V2XPnP, a novel intermediate fusion framework within one-step communication for end-to-end perception and prediction. Our framework employs a unified Transformer-based architecture to effectively model complex spatiotemporal relationships across temporal per-frame, spatial per-agent, and high-definition map. Moreover, we introduce the V2XPnP Sequential Dataset that supports all V2X cooperation modes and addresses the limitations of existing real-world datasets, which are restricted to single-frame or single-mode cooperation. Extensive experiments demonstrate our framework outperforms state-of-the-art methods in both perception and prediction tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01812v1-abstract-full').style.display = 'none'; document.getElementById('2412.01812v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website link: https://mobility-lab.seas.ucla.edu/v2xpnp/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00054">arXiv:2412.00054</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00054">pdf</a>, <a href="https://arxiv.org/format/2412.00054">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Less is More: Efficient Model Merging with Binary Task Switch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+B">Biqing Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+F">Fangyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Junqi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+P">Peng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00054v1-abstract-short" style="display: inline;"> As an effective approach to equip models with multi-task capabilities without additional training, model merging has garnered significant attention. However, existing methods face challenges of redundant parameter conflicts and the excessive storage burden of parameters. 
In this work, through controlled experiments, we reveal that for task vectors, only those parameters with magnitudes above a cer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00054v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00054v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00054v1-abstract-full" style="display: none;"> As an effective approach to equip models with multi-task capabilities without additional training, model merging has garnered significant attention. However, existing methods face challenges of redundant parameter conflicts and the excessive storage burden of parameters. In this work, through controlled experiments, we reveal that for task vectors, only those parameters with magnitudes above a certain threshold contribute positively to the task, exhibiting a pulse-like characteristic. We then attempt to leverage this characteristic to binarize the task vectors and reduce storage overhead. Further controlled experiments show that the binarized task vectors incur almost no decrease in fine-tuning and merging performance, and even exhibit stronger performance improvements as the proportion of redundant parameters increases. Based on these insights, we propose Task Switch (T-Switch), which decomposes task vectors into three components: 1) an activation switch instantiated by a binarized mask vector, 2) a polarity switch instantiated by a binarized sign vector, and 3) a scaling knob instantiated by a scalar coefficient. By storing task vectors in a binarized form, T-Switch alleviates parameter conflicts while ensuring efficient task parameter storage. Furthermore, to enable automated switch combination in T-Switch, we introduce Auto-Switch, which enables training-free switch combination via retrieval from a small query set. Experiments indicate that our methods achieve significant performance improvements over existing baselines, requiring only 1-3% of the storage space of full-precision parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00054v1-abstract-full').style.display = 'none'; document.getElementById('2412.00054v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
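<p>The three-component decomposition described above is easy to picture in code. A sketch under stated assumptions (the threshold rule and the scale estimate below are illustrative choices, not the paper's exact ones):</p> <pre><code class="language-python">
import torch

def to_task_switch(task_vector, threshold):
    """Decompose a task vector into T-Switch's three components."""
    mask = task_vector.abs() > threshold      # activation switch (binary)
    sign = torch.sign(task_vector)            # polarity switch (binary)
    scale = task_vector[mask].abs().mean()    # scaling knob (one scalar)
    return mask, sign, scale

def from_task_switch(mask, sign, scale):
    # Reconstruct an approximate task vector from roughly one bit
    # per parameter plus a single scalar.
    return scale * mask * sign
</code></pre> <p>Storing only the two binary vectors and the scalar is what brings the footprint down to a few percent of the full-precision task vector.</p>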
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19679">arXiv:2411.19679</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.19679">pdf</a>, <a href="https://arxiv.org/format/2411.19679">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> A Lightweight and Scalable Design of Segment Routing in Broadband LEO Constellations Using Landmark-Based Skeleton Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+M">Menglan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenxin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+B">Bin Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Benkuan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yan Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+K">Kai Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19679v1-abstract-short" style="display: inline;"> Emerging Low Earth Orbit (LEO) broadband constellations hold significant potential to provide advanced Internet services due to inherent geometric features of the grid topology. However, high dynamics, unstable topology changes, and frequent route updates bring significant challenges to fast and adaptive routing policies. In addition, since computing, bandwidth, and storage resources in each LEO sa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19679v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19679v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19679v1-abstract-full" style="display: none;"> Emerging Low Earth Orbit (LEO) broadband constellations hold significant potential to provide advanced Internet services due to inherent geometric features of the grid topology. However, high dynamics, unstable topology changes, and frequent route updates bring significant challenges to fast and adaptive routing policies. In addition, since computing, bandwidth, and storage resources in each LEO satellite are strictly limited, traffic demands are typically unbalanced, further enlarging the challenge to scalable routing policies with load balancing. Nevertheless, most existing research has failed to address the above difficulties. Therefore, this paper proposes a lightweight and scalable protocol of segment routing through landmark-based skeleton graphs. To improve the overall performance, we design an efficient multipath segment routing algorithm. First, the algorithm partitions the network into multiple regions to construct skeleton paths, which can effectively guide packet forwarding and reduce the operating costs. In each region, multipath probabilistic routing is used to achieve uniform traffic distribution, avoiding hotspot congestion. 
Furthermore, the flexible hierarchical partitioning and localized segmented routing are employed for fine-grained traffic control and QoS guarantees, combined with adaptive local single-path routing. Finally, experimental results validate our method&#39;s superior performance in terms of response time and network utility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19679v1-abstract-full').style.display = 'none'; document.getElementById('2411.19679v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18651">arXiv:2411.18651</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18651">pdf</a>, <a href="https://arxiv.org/format/2411.18651">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Verbalized Representation Learning for Interpretable Few-Shot Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Cheng-Fu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wenbo Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+N">Nanyun Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bolei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18651v1-abstract-short" style="display: inline;"> Humans recognize objects after observing only a few examples, a remarkable capability enabled by their inherent language understanding of the real-world environment. Developing verbalized and interpretable representations can significantly improve model generalization in low-data settings. In this work, we propose Verbalized Representation Learning (VRL), a novel approach for automatically extracti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18651v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18651v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18651v1-abstract-full" style="display: none;"> Humans recognize objects after observing only a few examples, a remarkable capability enabled by their inherent language understanding of the real-world environment. Developing verbalized and interpretable representations can significantly improve model generalization in low-data settings. 
arXiv:2411.18651  [pdf, other]  cs.CV cs.CL cs.LG

Verbalized Representation Learning for Interpretable Few-Shot Generalization

Authors: Cheng-Fu Yang, Da Yin, Wenbo Hu, Nanyun Peng, Bolei Zhou, Kai-Wei Chang

Abstract: Humans recognize objects after observing only a few examples, a remarkable capability enabled by their inherent language understanding of the real-world environment. Developing verbalized and interpretable representations can significantly improve model generalization in low-data settings. In this work, we propose Verbalized Representation Learning (VRL), a novel approach for automatically extracting human-interpretable features for object recognition using few-shot data. Our method uniquely captures inter-class differences and intra-class commonalities in the form of natural language by employing a Vision-Language Model (VLM) to identify key discriminative features between different classes and shared characteristics within the same class. These verbalized features are then mapped to numeric vectors through the VLM. The resulting feature vectors can be further utilized to train and infer with downstream classifiers. Experimental results show that, at the same model scale, VRL achieves a 24% absolute improvement over prior state-of-the-art methods while using 95% less data and a smaller model. Furthermore, compared to human-labeled attributes, the features learned by VRL exhibit a 20% absolute gain when used for downstream classification tasks. Code is available at: https://github.com/joeyy5588/VRL/tree/main.

Submitted 26 November, 2024; originally announced November 2024.
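As a sketch of the VRL pipeline the abstract describes (verbalize features with a VLM, score images against them to get numeric vectors, train a downstream classifier), the following uses stub functions in place of real VLM calls; the feature texts and scores are fabricated so the sketch runs end-to-end.

```python
from sklearn.linear_model import LogisticRegression

def vlm_propose_features(images, labels):
    # Stub: the VLM verbalizes inter-class differences and intra-class
    # commonalities, e.g. "has a serrated wing edge".
    return ["serrated wing edge", "uniform body color", "long curved beak"]

def vlm_score(image, feature_text):
    # Stub: the VLM rates how well feature_text matches image (0..1).
    return float(hash((image, feature_text)) % 100) / 100.0

def featurize(images, features):
    # Map each image to a numeric vector of per-feature scores.
    return [[vlm_score(img, f) for f in features] for img in images]

support, labels = ["img_a", "img_b", "img_c", "img_d"], [0, 1, 0, 1]
features = vlm_propose_features(support, labels)
clf = LogisticRegression().fit(featurize(support, features), labels)
print(clf.predict(featurize(["img_new"], features)))
```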
arXiv:2411.15585  [pdf, other]  cs.CV

Boosting Semi-Supervised Scene Text Recognition via Viewing and Summarizing

Authors: Yadong Qu, Yuxin Wang, Bangbang Zhou, Zixiao Wang, Hongtao Xie, Yongdong Zhang

Abstract: Existing scene text recognition (STR) methods struggle to recognize challenging texts, especially artistic and severely distorted characters. The limitation lies in the insufficient exploration of character morphologies, including the monotony of widely used synthetic training data and the sensitivity of the model to character morphologies. To address these issues, inspired by the human learning process of viewing and summarizing, we facilitate the contrastive learning-based STR framework in a self-motivated manner by leveraging synthetic and real unlabeled data without any human cost. In the viewing process, to compensate for the simplicity of synthetic data and enrich character morphology diversity, we propose an Online Generation Strategy to generate background-free samples with diverse character styles. By excluding background noise distractions, the model is encouraged to focus on character morphology and to generalize to complex samples even when trained with only simple synthetic data. To boost the summarizing process, we theoretically demonstrate the derivation error in the previous character contrastive loss, which mistakenly causes sparsity in the intra-class distribution and exacerbates ambiguity on challenging samples. Therefore, a new Character Unidirectional Alignment Loss is proposed to correct this error and unify the representation of the same characters across all samples by aligning the character features in the student model with the reference features in the teacher model. Extensive experimental results show that our method achieves SOTA performance (94.7% and 70.9% average accuracy on common benchmarks and Union14M-Benchmark). Code will be available at https://github.com/qqqyd/ViSu.

Submitted 23 November, 2024; originally announced November 2024.
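The abstract's one-way student-to-teacher alignment could look roughly like the sketch below: student character features are pulled toward same-class teacher references, with gradients blocked on the teacher side. The exact loss in the paper may differ; shapes and the cosine form here are illustrative.

```python
import torch
import torch.nn.functional as F

# Hypothetical sketch of a unidirectional alignment loss: alignment is one-way,
# so the teacher references receive no gradient (detach).
def unidirectional_alignment_loss(student_feats, labels, teacher_refs):
    """student_feats: (N, D); labels: (N,); teacher_refs: (C, D), one per class."""
    targets = teacher_refs[labels].detach()        # stop-gradient on teacher side
    s = F.normalize(student_feats, dim=-1)
    t = F.normalize(targets, dim=-1)
    return (1.0 - (s * t).sum(dim=-1)).mean()      # cosine alignment toward teacher

loss = unidirectional_alignment_loss(
    torch.randn(8, 128), torch.randint(0, 10, (8,)), torch.randn(10, 128))
print(loss)
```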
arXiv:2411.13859  [pdf, other]  cs.RO

Data-Driven Multi-step Nonlinear Model Predictive Control for Industrial Heavy Load Hydraulic Robot

Authors: Dexian Ma, Bo Zhou

Abstract: Automating complex industrial robots requires precise nonlinear control and efficient energy management. This paper introduces a data-driven nonlinear model predictive control (NMPC) framework to optimize control under multiple objectives. To enhance the prediction accuracy of the dynamic model, we design a single-shot multi-step prediction (SSMP) model based on long short-term memory (LSTM) networks and multilayer perceptrons (MLPs), which produces the entire prediction horizon in a single pass, avoiding iterative rollout and reducing computational load. Moreover, we combine offline and online models to address disturbances stemming from environmental interactions, analogous to the superposition of the robot's free and forced responses. The online model learns the system's variations from the prediction mismatches of the offline model and updates its weights in real time. The proposed hybrid predictive model reduces the input-output relationship to matrix multiplication, from which derivatives can be obtained quickly. The control signal sequence is therefore solved with a gradient descent method using an adaptive learning rate, allowing the NMPC cost function to be formulated as a convex function incorporating critical states. The learning rate is dynamically adjusted based on state errors to counteract the inherent prediction inaccuracies of neural networks. The controller outputs the average value of the control signal sequence instead of the first value. Simulations and experiments on a 22-ton hydraulic excavator have validated the effectiveness of our method, showing that the proposed NMPC approach can be widely applied to industrial systems, including nonlinear control and energy management.

Submitted 21 November, 2024; originally announced November 2024.
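A minimal sketch of the gradient-descent NMPC step described above is given below. The `predictor` stands in for the paper's LSTM+MLP single-shot multi-step model, and the cost weights and the adaptive-learning-rate rule are illustrative assumptions, not the paper's exact formulas.

```python
import torch

def nmpc_step(predictor, x0, x_ref, horizon=10, iters=50, base_lr=0.05):
    u = torch.zeros(horizon, requires_grad=True)   # control signal sequence
    for _ in range(iters):
        x_pred = predictor(x0, u)                  # (horizon,) predicted states
        cost = ((x_pred - x_ref) ** 2).sum() + 1e-3 * (u ** 2).sum()
        grad, = torch.autograd.grad(cost, u)
        # Illustrative adaptive rule: shrink the step when the state error is large.
        lr = base_lr / (1.0 + (x_pred[-1] - x_ref[-1]).abs().item())
        with torch.no_grad():
            u -= lr * grad
    return u.mean().item()                         # average of the sequence, per the paper

# Toy single-shot predictor: x_k = x0 + 0.1 * sum of controls, unrolled via cumsum.
toy = lambda x0, u: x0 + 0.1 * torch.cumsum(u, dim=0)
print(nmpc_step(toy, torch.tensor(0.0), torch.ones(10)))
```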
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13859v1-abstract-full').style.display = 'none'; document.getElementById('2411.13859v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13856">arXiv:2411.13856</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13856">pdf</a>, <a href="https://arxiv.org/format/2411.13856">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Data-Driven Modeling and Motion Control of Heavy-Load Hydraulic Manipulators via Reversible Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+D">Dexian Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yirong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenbo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bo Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13856v1-abstract-short" style="display: inline;"> This work proposes a data-driven modeling and the corresponding hybrid motion control framework for unmanned and automated operation of industrial heavy-load hydraulic manipulator. Rather than the direct use of a neural network black box, we construct a reversible nonlinear model by using multilayer perceptron to approximate dynamics in the physical integrator chain system after reversible transfo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13856v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13856v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13856v1-abstract-full" style="display: none;"> This work proposes a data-driven modeling and the corresponding hybrid motion control framework for unmanned and automated operation of industrial heavy-load hydraulic manipulator. Rather than the direct use of a neural network black box, we construct a reversible nonlinear model by using multilayer perceptron to approximate dynamics in the physical integrator chain system after reversible transformations. The reversible nonlinear model is trained offline using supervised learning techniques, and the data are obtained from simulations or experiments. Entire hybrid motion control framework consists of the model inversion controller that compensates for the nonlinear dynamics and proportional-derivative controller that enhances the robustness. The stability is proved with Lyapunov theory. Co-simulation and Experiments show the effectiveness of proposed modeling and hybrid control framework. With a commercial 39-ton class hydraulic excavator for motion control tasks, the root mean square error of trajectory tracking error decreases by at least 50\% compared to traditional control methods. 
arXiv:2411.11363  [pdf, other]  cs.CV

GPS-Gaussian+: Generalizable Pixel-wise 3D Gaussian Splatting for Real-Time Human-Scene Rendering from Sparse Views

Authors: Boyao Zhou, Shunyuan Zheng, Hanzhang Tu, Ruizhi Shao, Boning Liu, Shengping Zhang, Liqiang Nie, Yebin Liu

Abstract: Differentiable rendering techniques have recently shown promising results for free-viewpoint video synthesis of characters. However, such methods, whether based on Gaussian Splatting or neural implicit rendering, typically require per-subject optimization, which does not meet the real-time rendering requirements of interactive applications. We propose a generalizable Gaussian Splatting approach for high-resolution image rendering under a sparse-view camera setting. To this end, we introduce Gaussian parameter maps defined on the source views and directly regress Gaussian properties for instant novel view synthesis without any fine-tuning or optimization. We train our Gaussian parameter regression module on human-only or human-scene data, jointly with a depth estimation module that lifts the 2D parameter maps to 3D space. The proposed framework is fully differentiable and can be trained with both depth and rendering supervision or with rendering supervision alone. We further introduce a regularization term and an epipolar attention mechanism to preserve geometric consistency between the two source views, which matters especially when depth supervision is absent. Experiments on several datasets demonstrate that our method outperforms state-of-the-art methods while achieving superior rendering speed.

Submitted 18 November, 2024; originally announced November 2024.

Comments: Journal extension of CVPR 2024. Project page: https://yaourtb.github.io/GPS-Gaussian+
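The "lift 2D parameter maps to 3D" step amounts to unprojecting each source-view pixel with its predicted depth; a minimal sketch under a standard pinhole camera model follows. The intrinsics and depth values are toy assumptions; in the actual method each unprojected point carries its regressed Gaussian properties.

```python
import numpy as np

def lift_to_3d(depth, K):
    """depth: (H, W) predicted depth; K: (3, 3) intrinsics -> (H*W, 3) Gaussian centers."""
    H, W = depth.shape
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3)  # homogeneous pixels
    rays = pix @ np.linalg.inv(K).T                                  # camera-space rays
    return rays * depth.reshape(-1, 1)                               # scale rays by depth

K = np.array([[500.0, 0, 32], [0, 500.0, 24], [0, 0, 1]])
centers = lift_to_3d(np.full((48, 64), 2.0), K)   # toy: every pixel at 2 m depth
print(centers.shape)                              # (3072, 3)
```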
arXiv:2411.07123  [pdf, other]  cs.LG cs.AI

Fast and Robust Contextual Node Representation Learning over Dynamic Graphs

Authors: Xingzhi Guo, Silong Wang, Baojian Zhou, Yanghua Xiao, Steven Skiena

Abstract: Real-world graphs grow rapidly with edge and vertex insertions over time, motivating the problem of efficiently maintaining robust node representations over evolving graphs. Recent efficient GNNs are designed to decouple recursive message passing from the learning process and favor Personalized PageRank (PPR) as the underlying feature propagation mechanism. However, most PPR-based GNNs are designed for static graphs, and efficient PPR maintenance remains an open problem. Further, there is surprisingly little theoretical justification for the choice of PPR, despite its impressive empirical performance. In this paper, we are inspired by the recent formulation of PPR as an explicit $\ell_1$-regularized optimization problem and propose a unified dynamic graph learning framework based on sparse node-wise attention. We also present a set of desired properties that justify the choice of PPR in SOTA GNNs and serve as a guideline for future node attention designs. Meanwhile, we take advantage of the PPR-equivalent optimization formulation and employ the proximal gradient method (ISTA) to improve the efficiency of PPR-based GNNs by up to 6 times. Finally, we instantiate a simple yet effective model (GoPPE) with robust positional encodings obtained by maximizing the PPR previously used as attention. GoPPE performs comparably to or better than the SOTA baselines and greatly outperforms them when the initial node attributes are noisy during graph evolution, demonstrating its effectiveness and robustness.

Submitted 11 November, 2024; originally announced November 2024.
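ISTA (iterative shrinkage-thresholding) on an $\ell_1$-regularized quadratic is the template the abstract alludes to; a generic sketch follows. `Q`, `b`, and `rho` are illustrative: the PPR-specific objective would build `Q` from the graph's normalized adjacency and the teleport parameter.

```python
import numpy as np

def soft_threshold(x, t):
    return np.sign(x) * np.maximum(np.abs(x) - t, 0.0)

def ista(Q, b, rho, steps=200):
    """Minimize 0.5 * x^T Q x - b^T x + rho * ||x||_1 with Q symmetric PSD."""
    L = np.linalg.eigvalsh(Q)[-1]          # Lipschitz constant of the smooth gradient
    x = np.zeros_like(b)
    for _ in range(steps):
        grad = Q @ x - b                   # gradient step on the quadratic part...
        x = soft_threshold(x - grad / L, rho / L)  # ...then shrink (the l1 proximal map)
    return x                               # sparse, like a localized PPR vector

Q = np.array([[2.0, -0.5], [-0.5, 1.0]])
print(ista(Q, b=np.array([1.0, 0.2]), rho=0.1))
```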
arXiv:2411.03743  [pdf, other]  cs.AI q-bio.QM

Automating Exploratory Proteomics Research via Language Models

Authors: Ning Ding, Shang Qu, Linhai Xie, Yifei Li, Zaoqu Liu, Kaiyan Zhang, Yibai Xiong, Yuxin Zuo, Zhangren Chen, Ermo Hua, Xingtai Lv, Youbang Sun, Yang Li, Dong Li, Fuchu He, Bowen Zhou

Abstract: With the development of artificial intelligence, its contribution to science is evolving from simulating complex problems to automating entire research processes and producing novel discoveries. Achieving this advancement requires both specialized general models grounded in real-world scientific data and iterative, exploratory frameworks that mirror human scientific methodology. In this paper, we present PROTEUS, a fully automated system for scientific discovery from raw proteomics data.
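(Continuing the abstract below, the system's loop is plan, run tools, refine; a toy sketch of that control flow follows the entry. The `llm` and `run_tool` callables are stand-ins for real LLM and bioinformatics tool invocations; PROTEUS's actual prompts and tool set are not specified here.)

```python
# Hypothetical plan -> execute -> refine loop, mirroring the abstract's description.
def explore(dataset, llm, run_tool, rounds=3):
    objectives = llm(f"Propose research objectives for proteomics dataset: {dataset}")
    hypotheses = []
    for _ in range(rounds):
        plan = llm(f"Plan analyses for objectives: {objectives}")   # hierarchical planning
        results = [run_tool(step) for step in plan]                 # specialized tools
        hypotheses = llm(f"Refine hypotheses given results: {results}")
        objectives = llm(f"Update objectives given: {hypotheses}")  # iterative refinement
    return hypotheses

# Toy stubs so the sketch runs end-to-end.
print(explore("dataset.tsv", llm=lambda p: [p[:40]], run_tool=lambda s: f"ran {s}"))
```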
PROTEUS uses large language models (LLMs) to perform hierarchical planning, execute specialized bioinformatics tools, and iteratively refine analysis workflows to generate high-quality scientific hypotheses. The system takes proteomics datasets as input and produces a comprehensive set of research objectives, analysis results, and novel biological hypotheses without human intervention. We evaluated PROTEUS on 12 proteomics datasets collected from various biological samples (e.g. immune cells, tumors) and different sample types (single-cell and bulk), generating 191 scientific hypotheses. These were assessed using both automatic LLM-based scoring on 5 metrics and detailed reviews from human experts. The results demonstrate that PROTEUS consistently produces reliable, logically coherent outputs that align well with the existing literature while also proposing novel, evaluable hypotheses. The system's flexible architecture facilitates seamless integration of diverse analysis tools and adaptation to different proteomics data types. By automating complex proteomics analysis workflows and hypothesis generation, PROTEUS has the potential to considerably accelerate the pace of scientific discovery in proteomics research, enabling researchers to efficiently explore large-scale datasets and uncover biological insights.

Submitted 6 November, 2024; originally announced November 2024.

arXiv:2411.03169  [pdf, other]  cs.CV cs.LG

Pre-trained Visual Dynamics Representations for Efficient Policy Learning

Authors: Hao Luo, Bohan Zhou, Zongqing Lu

Abstract: Pre-training for Reinforcement Learning (RL) with purely video data is a valuable yet challenging problem. Although in-the-wild videos are readily available and inherently carry a vast amount of prior world knowledge, the absence of action annotations and the common domain gap with downstream tasks hinder their use for RL pre-training. To address this challenge, we propose Pre-trained Visual Dynamics Representations (PVDR) to bridge the domain gap between videos and downstream tasks for efficient policy learning. By adopting video prediction as the pre-training task, we use a Transformer-based Conditional Variational Autoencoder (CVAE) to learn visual dynamics representations. The pre-trained representations capture the visual dynamics prior knowledge in the videos. This abstract prior knowledge can be readily adapted to downstream tasks and aligned with executable actions through online adaptation. We conduct experiments on a series of robotics visual control tasks and verify that PVDR is an effective form of pre-training with videos that promotes policy learning.

Submitted 5 November, 2024; originally announced November 2024.

Comments: ECCV 2024
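A toy sketch of the CVAE-for-video-prediction pre-training task follows. Frames are pre-encoded to 64-d vectors here, the KL is taken against a standard normal for brevity, and all sizes are toy; the paper's model is a full Transformer-based CVAE.

```python
import torch
import torch.nn as nn

class VideoCVAE(nn.Module):
    def __init__(self, d=64, z=16):
        super().__init__()
        self.post = nn.Linear(2 * d, 2 * z)     # posterior q(z | context, future)
        self.dec = nn.Linear(d + z, d)          # decode future from context + z

    def forward(self, ctx, fut):
        mu, logvar = self.post(torch.cat([ctx, fut], -1)).chunk(2, -1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()   # reparameterization
        pred = self.dec(torch.cat([ctx, z], -1))
        kl = 0.5 * (mu.pow(2) + logvar.exp() - 1 - logvar).sum(-1).mean()
        return ((pred - fut) ** 2).mean() + 1e-3 * kl          # reconstruction + KL

print(VideoCVAE()(torch.randn(4, 64), torch.randn(4, 64)))
```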
arXiv:2411.02063  [pdf, other]  cs.CL cs.AI cs.LG

Scalable Efficient Training of Large Language Models with Low-dimensional Projected Attention

Authors: Xingtai Lv, Ning Ding, Kaiyan Zhang, Ermo Hua, Ganqu Cui, Bowen Zhou

Abstract: Improving the effectiveness and efficiency of large language models (LLMs) simultaneously is a critical yet challenging research goal. In this paper, we find that low-rank pre-training, normally considered an efficiency technique that compromises performance, can be scalably effective when the reduced parameters are precisely targeted. Specifically, applying the low-dimensional module only to the attention layer resolves this issue and enhances both effectiveness and efficiency. We refer to this structure as Low-dimensional Projected Attention (LPA) and provide an explanatory analysis. Through extensive experimentation at parameter scales of 130M and 370M, and scaling up to 3B, we validate the effectiveness and scalability of LPA. Our results show that the LPA model can save up to 12.4% in time while achieving an approximately 5% improvement in test perplexity (ppl) and on downstream tasks compared with the vanilla Transformer.

Submitted 4 November, 2024; originally announced November 2024.

Comments: Accepted to EMNLP 2024 (Main Conference)
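The low-rank idea behind LPA, as described in the abstract, is to replace a dense projection W (d x d) with two thin factors, cutting parameters from d*d to 2*d*r. A minimal sketch follows; the paper's exact placement within the attention layer and its choice of rank may differ.

```python
import torch
import torch.nn as nn

class LowRankLinear(nn.Module):
    """A d -> d projection factored through a low-dimensional bottleneck of rank r."""
    def __init__(self, d, r):
        super().__init__()
        self.down = nn.Linear(d, r, bias=False)   # d -> r
        self.up = nn.Linear(r, d, bias=False)     # r -> d

    def forward(self, x):
        return self.up(self.down(x))

d, r = 512, 64
q_proj = LowRankLinear(d, r)          # used where a dense W_q would otherwise sit
print(d * d, 2 * d * r)               # parameter counts: 262144 dense vs 65536 low-rank
print(q_proj(torch.randn(2, 10, d)).shape)
```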
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02063v1-abstract-full').style.display = 'none'; document.getElementById('2411.02063v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 (Main Conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23919">arXiv:2410.23919</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23919">pdf</a>, <a href="https://arxiv.org/format/2410.23919">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Intelligent Angle Map-based Beam Alignment for RIS-aided mmWave Communication Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+H">Hao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Q">Qing Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yanping Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Binggui Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+M">Meng Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qianbin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23919v1-abstract-short" style="display: inline;"> Recently, reconfigurable intelligent surface (RIS) has been widely used to enhance the performance of millimeter wave (mmWave) communication systems, making beam alignment more challenging. To ensure efficient communication, this paper proposes a novel intelligent angle map-based beam alignment scheme for both general user equipments (UEs) and RIS-aided UEs simultaneously in a fast and effective w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23919v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23919v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23919v1-abstract-full" style="display: none;"> Recently, reconfigurable intelligent surface (RIS) has been widely used to enhance the performance of millimeter wave (mmWave) communication systems, making beam alignment more challenging. To ensure efficient communication, this paper proposes a novel intelligent angle map-based beam alignment scheme for both general user equipments (UEs) and RIS-aided UEs simultaneously in a fast and effective way. Specifically, we construct a beam alignment architecture that utilizes only angular information. 
arXiv:2410.21634  [pdf, other]  cs.LG cs.SI

Faster Local Solvers for Graph Diffusion Equations

Authors: Jiahe Bai, Baojian Zhou, Deqing Yang, Yanghua Xiao

Abstract: Efficient computation of graph diffusion equations (GDEs), such as Personalized PageRank, Katz centrality, and the heat kernel, is crucial for clustering, training neural networks, and many other graph-related problems. Standard iterative methods require accessing the whole graph per iteration, making them time-consuming for large-scale graphs. While existing local solvers approximate diffusion vectors through heuristic local updates, they often operate sequentially and are typically designed for specific diffusion types, limiting their applicability. Given that diffusion vectors are highly localizable, as measured by the participation ratio, this paper introduces a novel framework for approximately solving GDEs using a local diffusion process. This framework reveals the suboptimality of existing local solvers. Furthermore, our approach effectively localizes standard iterative solvers by designing simple and provably sublinear-time algorithms. These new local solvers are highly parallelizable, making them well suited for implementation on GPUs. We demonstrate the effectiveness of our framework in quickly obtaining approximate diffusion vectors, achieving up to a hundred-fold speed improvement, and its applicability to large-scale dynamic graphs. Our framework could also facilitate more efficient local message-passing mechanisms for GNNs.

Submitted 22 December, 2024; v1 submitted 28 October, 2024; originally announced October 2024.

Comments: NeurIPS 2024
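For context, the classic Andersen-Chung-Lang "push" method is the canonical sequential local solver this line of work builds on: it approximates a Personalized PageRank vector while touching only vertices near the seed, never the whole graph. The paper's contribution is to localize and parallelize standard iterative solvers beyond such heuristics; the sketch below shows only the baseline push on a toy adjacency list.

```python
from collections import defaultdict

def ppr_push(adj, seed, alpha=0.15, eps=1e-4):
    p, r = defaultdict(float), defaultdict(float)   # estimate and residual mass
    r[seed] = 1.0
    queue = [seed]
    while queue:
        u = queue.pop()
        deg = len(adj[u])
        if r[u] < eps * deg:                        # residual too small: skip
            continue
        p[u] += alpha * r[u]                        # settle a fraction of mass locally
        push = (1 - alpha) * r[u] / (2 * deg)
        r[u] = (1 - alpha) * r[u] / 2               # lazy-walk variant keeps half at u
        if r[u] >= eps * deg:
            queue.append(u)
        for v in adj[u]:                            # spread the rest to neighbors only
            r[v] += push
            if r[v] >= eps * len(adj[v]):
                queue.append(v)
    return dict(p)

adj = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3], 3: [2]}
print(ppr_push(adj, seed=0))
```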
