Search | arXiv e-print repository

Showing 1–50 of 1,183 results for author: Xu, S

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2503.21144 [pdf, other] cs.CV
   ChatAnyone: Stylized Real-time Portrait Video Generation with Hierarchical Motion Diffusion Model
   Authors: Jinwei Qi, Chaonan Ji, Sheng Xu, Peng Zhang, Bang Zhang, Liefeng Bo
   Abstract: Real-time interactive video-chat portraits have been increasingly recognized as the future trend, particularly due to the remarkable progress made in text and voice chat technologies. However, existing methods primarily focus on real-time generation of head movements, but struggle to produce synchronized body motions that match these head actions. Additionally, achieving fine-grained control over the speaking style and nuances of facial expressions remains a challenge. To address these limitations, we introduce a novel framework for stylized real-time portrait video generation, enabling expressive and flexible video chat that extends from talking head to upper-body interaction. Our approach consists of two stages. The first stage involves efficient hierarchical motion diffusion models that take both explicit and implicit motion representations into account based on audio inputs, and can generate a diverse range of facial expressions with stylistic control and synchronization between head and body movements. The second stage aims to generate portrait video featuring upper-body movements, including hand gestures. We inject explicit hand control signals into the generator to produce more detailed hand movements, and further perform face refinement to enhance the overall realism and expressiveness of the portrait video. Additionally, our approach supports efficient and continuous generation of upper-body portrait video at up to 512×768 resolution and 30 fps on a 4090 GPU, supporting interactive video chat in real time. Experimental results demonstrate the capability of our approach to produce portrait videos with rich expressiveness and natural upper-body movements.
   Submitted 27 March, 2025; originally announced March 2025.
   Comments: Project Page: https://humanaigc.github.io/chat-anyone/

2. arXiv:2503.20768 [pdf, other] cs.LG cs.DC
   An Empirical Study of the Impact of Federated Learning on Machine Learning Model Accuracy
   Authors: Haotian Yang, Zhuoran Wang, Benson Chou, Sophie Xu, Hao Wang, Jingxian Wang, Qizhen Zhang
   Abstract: Federated Learning (FL) enables distributed ML model training on private user data at global scale. Despite the potential of FL demonstrated in many domains, an in-depth view of its impact on model accuracy remains unclear. In this paper, we systematically investigate how this learning paradigm affects the accuracy of state-of-the-art ML models for a variety of ML tasks. We present an empirical study that covers various data types (text, image, audio, and video) and FL configuration knobs (data distribution, FL scale, client sampling, and local and global computations). Our experiments are conducted in a unified FL framework to achieve high fidelity, with substantial human effort and resource investment. Based on the results, we perform a quantitative analysis of the impact of FL, highlight challenging scenarios where applying FL drastically degrades model accuracy, and identify cases where the impact is negligible. The detailed and extensive findings can benefit practical deployments and future development of FL.
   Submitted 26 March, 2025; v1 submitted 26 March, 2025; originally announced March 2025.
   ACM Class: C.2.4; I.2.6

3. arXiv:2503.20324 [pdf, other] cs.RO cs.MA
   CTS-CBS: A New Approach for Multi-Agent Collaborative Task Sequencing and Path Finding
   Authors: Junkai Jiang, Ruochen Li, Yibin Yang, Yihe Chen, Yuning Wang, Shaobing Xu, Jianqiang Wang
   Abstract: This paper addresses a generalization of Multi-Agent Pathfinding (MAPF), called Collaborative Task Sequencing - Multi-Agent Pathfinding (CTS-MAPF), where agents must plan collision-free paths and visit a series of intermediate task locations in a specific order before reaching their final destinations. To address this problem, we propose a new approach, Collaborative Task Sequencing - Conflict-Based Search (CTS-CBS), which conducts a two-level search. At the high level, it generates a search forest, where each tree corresponds to a joint task sequence derived from the jTSP solution. At the low level, CTS-CBS performs constrained single-agent path planning to generate paths for each agent while adhering to high-level constraints. We also provide theoretical guarantees of its completeness and optimality (or sub-optimality with a bounded parameter). To evaluate the performance of CTS-CBS, we create two datasets, CTS-MAPF and MG-MAPF, and conduct comprehensive experiments. The results show that CTS-CBS adaptations for MG-MAPF outperform baseline algorithms in terms of success rate (up to 20 times higher) and runtime (up to 100 times faster), with less than a 10% sacrifice in solution quality. Furthermore, CTS-CBS offers flexibility by allowing users to adjust the sub-optimality bound omega to balance solution quality and efficiency. Finally, practical robot tests demonstrate the algorithm's applicability in real-world scenarios.
   Submitted 26 March, 2025; originally announced March 2025.

4. arXiv:2503.20020 [pdf, other] cs.RO
   Gemini Robotics: Bringing AI into the Physical World
   Authors: Gemini Robotics Team, Saminda Abeyruwan, Joshua Ainslie, Jean-Baptiste Alayrac, Montserrat Gonzalez Arenas, Travis Armstrong, Ashwin Balakrishna, Robert Baruch, Maria Bauza, Michiel Blokzijl, Steven Bohez, Konstantinos Bousmalis, Anthony Brohan, Thomas Buschmann, Arunkumar Byravan, Serkan Cabi, Ken Caluwaerts, Federico Casarini, Oscar Chang, Jose Enrique Chen, Xi Chen, Hao-Tien Lewis Chiang, Krzysztof Choromanski, David D'Ambrosio, Sudeep Dasari, et al. (93 additional authors not shown)
   Abstract: Recent advancements in large multimodal models have led to the emergence of remarkable generalist capabilities in digital domains, yet their translation to physical agents such as robots remains a significant challenge. This report introduces a new family of AI models purposefully designed for robotics and built upon the foundation of Gemini 2.0. We present Gemini Robotics, an advanced Vision-Language-Action (VLA) generalist model capable of directly controlling robots. Gemini Robotics executes smooth and reactive movements to tackle a wide range of complex manipulation tasks, while also being robust to variations in object types and positions, handling unseen environments, and following diverse, open-vocabulary instructions. We show that with additional fine-tuning, Gemini Robotics can be specialized to new capabilities, including solving long-horizon, highly dexterous tasks, learning new short-horizon tasks from as few as 100 demonstrations, and adapting to completely novel robot embodiments. This is made possible because Gemini Robotics builds on top of the Gemini Robotics-ER model, the second model we introduce in this work. Gemini Robotics-ER (Embodied Reasoning) extends Gemini's multimodal reasoning capabilities into the physical world, with enhanced spatial and temporal understanding. This enables capabilities relevant to robotics, including object detection, pointing, trajectory and grasp prediction, as well as multi-view correspondence and 3D bounding box prediction. We show how this novel combination can support a variety of robotics applications. We also discuss and address important safety considerations related to this new class of robotics foundation models. The Gemini Robotics family marks a substantial step towards developing general-purpose robots that realize AI's potential in the physical world.
   Submitted 25 March, 2025; originally announced March 2025.
5. arXiv:2503.19769 [pdf, other] cs.CV cs.LG
   BiPrompt-SAM: Enhancing Image Segmentation via Explicit Selection between Point and Text Prompts
   Authors: Suzhe Xu, Jialin Peng, Chengyuan Zhang
   Abstract: Segmentation is a fundamental task in computer vision, with prompt-driven methods gaining prominence due to their flexibility. The recent Segment Anything Model (SAM) has demonstrated powerful point-prompt segmentation capabilities, while text-based segmentation models offer rich semantic understanding. However, existing approaches rarely explore how to effectively combine these complementary modalities for optimal segmentation performance. This paper presents BiPrompt-SAM, a novel dual-modal prompt segmentation framework that fuses the advantages of point and text prompts through an explicit selection mechanism. Specifically, we leverage SAM's inherent ability to generate multiple mask candidates, combined with a semantic guidance mask from text prompts, and explicitly select the most suitable candidate based on similarity metrics. This approach can be viewed as a simplified Mixture of Experts (MoE) system, where the point and text modules act as distinct "experts," and the similarity scoring serves as a rudimentary "gating network." We conducted extensive evaluations on both the Endovis17 medical dataset and the RefCOCO series of natural image datasets. On Endovis17, BiPrompt-SAM achieved 89.55% mDice and 81.46% mIoU, comparable to state-of-the-art specialized medical segmentation models. On the RefCOCO series datasets, our method attained 87.1%, 86.5%, and 85.8% IoU, significantly outperforming existing approaches. Experiments demonstrate that our explicit dual-selection method effectively combines the spatial precision of point prompts with the semantic richness of text prompts, particularly excelling in scenarios involving semantically complex objects, multiple similar objects, and partial occlusions. BiPrompt-SAM not only provides a simple yet effective implementation but also offers a new perspective on multi-modal prompt fusion.
   Submitted 25 March, 2025; originally announced March 2025.
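The explicit-selection idea in the abstract above can be illustrated with a minimal sketch: score each point-prompt mask candidate against a text-derived guidance mask and keep the best match. This is not the authors' implementation; the function names, the choice of IoU as the similarity metric, and the toy data are assumptions.

```python
# Rough sketch (not the paper's code) of selecting among SAM-style mask
# candidates using a text-derived guidance mask; IoU is an assumed metric.
import numpy as np

def iou(a: np.ndarray, b: np.ndarray) -> float:
    """Intersection-over-union between two boolean masks."""
    inter = np.logical_and(a, b).sum()
    union = np.logical_or(a, b).sum()
    return float(inter) / float(union) if union > 0 else 0.0

def select_mask(point_masks: list[np.ndarray], text_mask: np.ndarray) -> np.ndarray:
    """Pick the point-prompt candidate most similar to the text guidance mask."""
    scores = [iou(m, text_mask) for m in point_masks]   # rudimentary "gating" scores
    return point_masks[int(np.argmax(scores))]

# Toy usage: three 4x4 candidates, guidance favouring the lower-right corner.
cands = [np.zeros((4, 4), bool) for _ in range(3)]
cands[0][:2, :2] = True; cands[1][2:, 2:] = True; cands[2][:, :] = True
guide = np.zeros((4, 4), bool); guide[2:, 2:] = True
best = select_mask(cands, guide)   # selects cands[1]
```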
6. arXiv:2503.18802 [pdf] cs.IR cs.SD; doi: 10.5334/tismir.194
   CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research
   Authors: Monan Zhou, Shenyang Xu, Zhaorui Liu, Zhaowen Wang, Feng Yu, Wei Li, Baoqiang Han
   Abstract: Data are crucial in various computer-related fields, including music information retrieval (MIR), an interdisciplinary area bridging computer science and music. This paper introduces CCMusic, an open and diverse database comprising multiple datasets specifically designed for tasks related to Chinese music, highlighting our focus on this culturally rich domain. The database integrates both published and unpublished datasets, with steps such as data cleaning, label refinement, and data structure unification taken to ensure data consistency and create ready-to-use versions. We conduct benchmark evaluations for all datasets using a unified evaluation framework developed specifically for this purpose. This publicly available framework supports both classification and detection tasks, ensuring standardized and reproducible results across all datasets. The database is hosted on HuggingFace and ModelScope, two open and multifunctional data and model hosting platforms, ensuring ease of accessibility and usability.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: 17 pages, 18 figures
   Journal ref: Transactions of the International Society for Music Information Retrieval, 2025, 8(1), 22-38

7. arXiv:2503.16768 [pdf] cs.CV cs.AI
   Dynamic Attention Mechanism in Spatiotemporal Memory Networks for Object Tracking
   Authors: Meng Zhou, Jiadong Xie, Mingsheng Xu
   Abstract: Mainstream visual object tracking frameworks predominantly rely on template matching paradigms. Their performance heavily depends on the quality of template features, which becomes increasingly challenging to maintain in complex scenarios involving target deformation, occlusion, and background clutter. While existing spatiotemporal memory-based trackers emphasize memory capacity expansion, they lack effective mechanisms for dynamic feature selection and adaptive fusion. To address this gap, we propose a Dynamic Attention Mechanism in Spatiotemporal Memory Network (DASTM) with two key innovations: 1) a differentiable dynamic attention mechanism that adaptively adjusts channel-spatial attention weights by analyzing spatiotemporal correlations between the templates and memory features; 2) a lightweight gating network that autonomously allocates computational resources based on target motion states, prioritizing high-discriminability features in challenging scenarios. Extensive evaluations on the OTB-2015, VOT2018, LaSOT, and GOT-10k benchmarks demonstrate DASTM's superiority, achieving state-of-the-art performance in success rate, robustness, and real-time efficiency, thereby offering a novel solution for real-time tracking in complex environments.
   Submitted 20 March, 2025; originally announced March 2025.
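For readers unfamiliar with "channel-spatial attention weights" as mentioned in the abstract above, the sketch below shows a generic channel-then-spatial attention block in the style of CBAM. It is only an illustration of the general mechanism, not the paper's DASTM module; the layer sizes, reduction ratio, and names are assumptions.

```python
# Generic channel-spatial attention block (CBAM-style) for illustration only;
# not the DASTM module from the paper. Hyperparameters are arbitrary choices.
import torch
import torch.nn as nn

class ChannelSpatialAttention(nn.Module):
    def __init__(self, channels: int, reduction: int = 8):
        super().__init__()
        # Channel attention: squeeze spatial dims, reweight channels via a small MLP.
        self.channel_mlp = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, channels // reduction, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // reduction, channels, 1),
            nn.Sigmoid(),
        )
        # Spatial attention: 7x7 conv over channel-pooled maps.
        self.spatial_conv = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, padding=3),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x * self.channel_mlp(x)                      # reweight channels
        avg_map = x.mean(dim=1, keepdim=True)            # (B, 1, H, W)
        max_map = x.amax(dim=1, keepdim=True)            # (B, 1, H, W)
        spatial = self.spatial_conv(torch.cat([avg_map, max_map], dim=1))
        return x * spatial                               # reweight spatial locations

# Toy usage on a random feature map.
feat = torch.randn(2, 64, 32, 32)
out = ChannelSpatialAttention(64)(feat)   # same shape as feat
```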
8. arXiv:2503.16252 [pdf, other] cs.CL
   Fin-R1: A Large Language Model for Financial Reasoning through Reinforcement Learning
   Authors: Zhaowei Liu, Xin Guo, Fangqi Lou, Lingfeng Zeng, Jinyi Niu, Zixuan Wang, Jiajie Xu, Weige Cai, Ziwei Yang, Xueqian Zhao, Chao Li, Sheng Xu, Dezhi Chen, Yun Chen, Zuo Bai, Liwen Zhang
   Abstract: Reasoning large language models are rapidly evolving across various domains. However, their capabilities in handling complex financial tasks still require in-depth exploration. In this paper, we introduce Fin-R1, a reasoning large language model specifically designed for the financial sector. Fin-R1 is built using a two-stage architecture, leveraging a financial reasoning dataset distilled and processed based on DeepSeek-R1. Through supervised fine-tuning (SFT) and reinforcement learning (RL) training, it demonstrates performance close to DeepSeek-R1 across a range of financial reasoning tasks with a parameter size of only 7 billion. It achieves state-of-the-art (SOTA) results on the FinQA and ConvFinQA tasks among the LLMs in our evaluation, surpassing larger models in other tasks as well. Fin-R1 showcases strong reasoning and decision-making capabilities, providing solutions to various problems encountered in the financial domain. Our code is available at https://github.com/SUFE-AIFLM-Lab/Fin-R1.
   Submitted 20 March, 2025; v1 submitted 20 March, 2025; originally announced March 2025.
9. arXiv:2503.14492 [pdf, other] cs.CV cs.AI cs.LG cs.RO
   Cosmos-Transfer1: Conditional World Generation with Adaptive Multimodal Control
   Authors: NVIDIA: Hassan Abu Alhaija, Jose Alvarez, Maciej Bala, Tiffany Cai, Tianshi Cao, Liz Cha, Joshua Chen, Mike Chen, Francesco Ferroni, Sanja Fidler, Dieter Fox, Yunhao Ge, Jinwei Gu, Ali Hassani, Michael Isaev, Pooya Jannaty, Shiyi Lan, Tobias Lasser, Huan Ling, Ming-Yu Liu, Xian Liu, Yifan Lu, Alice Luo, et al. (15 additional authors not shown)
   Abstract: We introduce Cosmos-Transfer, a conditional world generation model that can generate world simulations based on multiple spatial control inputs of various modalities, such as segmentation, depth, and edge. In the design, the spatial conditional scheme is adaptive and customizable: it allows weighting different conditional inputs differently at different spatial locations. This enables highly controllable world generation and finds use in various world-to-world transfer use cases, including Sim2Real. We conduct extensive evaluations to analyze the proposed model and demonstrate its applications for Physical AI, including robotics Sim2Real and autonomous vehicle data enrichment. We further demonstrate an inference scaling strategy to achieve real-time world generation with an NVIDIA GB200 NVL72 rack. To help accelerate research development in the field, we open-source our models and code at https://github.com/nvidia-cosmos/cosmos-transfer1.
   Submitted 18 March, 2025; originally announced March 2025.

10. arXiv:2503.13252 [pdf, other] cs.RO eess.SP
    Digital Beamforming Enhanced Radar Odometry
    Authors: Jingqi Jiang, Shida Xu, Kaicheng Zhang, Jiyuan Wei, Jingyang Wang, Sen Wang
    Abstract: Radar has become an essential sensor for autonomous navigation, especially in challenging environments where camera and LiDAR sensors fail. 4D single-chip millimeter-wave radar systems, in particular, have drawn increasing attention thanks to their ability to provide spatial and Doppler information with low hardware cost and power consumption. However, most single-chip radar systems using traditional signal processing, such as the Fast Fourier Transform, suffer from limited spatial resolution in radar detection, significantly limiting the performance of radar-based odometry and Simultaneous Localization and Mapping (SLAM) systems. In this paper, we develop a novel radar signal processing pipeline that integrates spatial-domain beamforming techniques, and extend it to 3D Direction of Arrival estimation. Experiments using public datasets are conducted to evaluate and compare the performance of our proposed signal processing pipeline against traditional methodologies. These tests specifically focus on assessing structural precision across diverse scenes and measuring odometry accuracy in different radar odometry systems. This research demonstrates the feasibility of achieving more accurate radar odometry by simply replacing the standard FFT-based processing with the proposed pipeline. The codes are available at GitHub*.
    Submitted 17 March, 2025; originally announced March 2025.
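As background to the abstract above, the sketch below shows what spatial-domain beamforming for direction-of-arrival estimation looks like in its simplest (conventional/Bartlett) form on a uniform linear array. It is a generic textbook illustration, not the paper's pipeline; the array geometry, noise level, and source angle are arbitrary assumptions.

```python
# Minimal conventional (Bartlett) beamforming sketch for DoA estimation on a
# uniform linear array -- a generic illustration, not the paper's pipeline.
import numpy as np

def steering_vector(n_elem: int, spacing_wl: float, theta_rad: float) -> np.ndarray:
    """Array response of a ULA (element spacing given in wavelengths)."""
    n = np.arange(n_elem)
    return np.exp(2j * np.pi * spacing_wl * n * np.sin(theta_rad))

rng = np.random.default_rng(0)
n_elem, spacing, n_snap = 8, 0.5, 256
true_doa = np.deg2rad(20.0)

# Simulate snapshots: one far-field source plus noise (assumed scenario).
s = rng.standard_normal(n_snap) + 1j * rng.standard_normal(n_snap)
a = steering_vector(n_elem, spacing, true_doa)[:, None]
noise = 0.1 * (rng.standard_normal((n_elem, n_snap))
               + 1j * rng.standard_normal((n_elem, n_snap)))
x = a @ s[None, :] + noise                      # (n_elem, n_snap) array snapshots

R = (x @ x.conj().T) / n_snap                   # spatial covariance matrix

# Scan candidate angles; the Bartlett spectrum a^H R a peaks near the true DoA.
angles = np.deg2rad(np.linspace(-90, 90, 361))
spectrum = np.array([np.real(steering_vector(n_elem, spacing, t).conj()
                             @ R @ steering_vector(n_elem, spacing, t))
                     for t in angles])
est_doa = np.rad2deg(angles[np.argmax(spectrum)])   # close to 20 degrees
```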
11. arXiv:2503.12946 [pdf, other] cs.AR cs.AI
    Open3DBench: Open-Source Benchmark for 3D-IC Backend Implementation and PPA Evaluation
    Authors: Yunqi Shi, Chengrui Gao, Wanqi Ren, Siyuan Xu, Ke Xue, Mingxuan Yuan, Chao Qian, Zhi-Hua Zhou
    Abstract: This work introduces Open3DBench, an open-source 3D-IC backend implementation benchmark built upon the OpenROAD-flow-scripts framework, enabling comprehensive evaluation of power, performance, area, and thermal metrics. Our proposed flow supports modular integration of 3D partitioning, placement, 3D routing, RC extraction, and thermal simulation, aligning with advanced 3D flows that rely on commercial tools and in-house scripts. We present two foundational 3D placement algorithms: Open3D-Tiling, which emphasizes regular macro placement, and Open3D-DMP, which enhances wirelength optimization through cross-die co-placement with the analytical placer DREAMPlace. Experimental results show significant improvements in area (51.19%), wirelength (24.06%), timing (30.84%), and power (5.72%) compared to 2D flows. The results also highlight that better wirelength does not necessarily lead to PPA gains, emphasizing the need to develop PPA-driven methods. Open3DBench offers a standardized, reproducible platform for evaluating 3D EDA methods, effectively bridging the gap between open-source tools and commercial solutions in 3D-IC design.
    Submitted 17 March, 2025; originally announced March 2025.

12. arXiv:2503.11935 [pdf, other] cs.CV
    Design of an Expression Recognition Solution Based on the Global Channel-Spatial Attention Mechanism and Proportional Criterion Fusion
    Authors: Jun Yu, Yang Zheng, Lei Wang, Yongqi Wang, Shengfan Xu
    Abstract: Facial expression recognition is a challenging classification task that holds broad application prospects in the field of human-computer interaction. This paper introduces the method we will adopt in the 8th Affective and Behavioral Analysis in the Wild (ABAW) Competition, held during the Conference on Computer Vision and Pattern Recognition (CVPR) in 2025. First, we apply a frequency masking technique and extract frames at equal time intervals to perform targeted processing on the original videos. Then, based on a residual hybrid convolutional neural network and a multi-branch convolutional neural network, respectively, we design feature extraction models for image and audio sequences. In particular, we propose a global channel-spatial attention mechanism to enhance the features initially extracted from the audio and image modalities. Finally, we adopt a decision fusion strategy based on the proportional criterion to fuse the classification results of the two single modalities, obtain an emotion probability vector, and output the final emotion classification. We also design a coarse-fine granularity loss function to optimize the performance of the entire network, which effectively improves the accuracy of facial expression recognition. In the facial expression recognition task of the 8th ABAW Competition, our method ranked third on the official validation set, confirming the effectiveness and competitiveness of the proposed method.
    Submitted 21 March, 2025; v1 submitted 14 March, 2025; originally announced March 2025.
In particular, we propose a global channel-spatial attention mechanism to enhance the features initially extracted from both the audio and image modalities respectively.Finally, we adopt a decision fusion strategy based on the proportional criterion to fuse the classification results of the two single modalities, obtain an emotion probability vector, and output the final emotional classification. We also design a coarse - fine granularity loss function to optimize the performance of the entire network, which effectively improves the accuracy of facial expression recognition.In the facial expression recognition task of the 8th ABAW Competition, our method ranked third on the official validation set. This result fully confirms the effectiveness and competitiveness of the method we have proposed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11935v3-abstract-full').style.display = 'none'; document.getElementById('2503.11935v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11835">arXiv:2503.11835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.11835">pdf</a>, <a href="https://arxiv.org/format/2503.11835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> How Can Time Series Analysis Benefit From Multiple Modalities? A Survey and Outlook </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haoxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kamarthi%2C+H">Harshavardhan Kamarthi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhiyuan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shangqing Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Q">Qingsong Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Hartvigsen%2C+T">Tom Hartvigsen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Prakash%2C+B+A">B. Aditya Prakash</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11835v2-abstract-short" style="display: inline;"> Time series analysis (TSA) is a longstanding research topic in the data mining community and has wide real-world significance. Compared to &#34;richer&#34; modalities such as language and vision, which have recently experienced explosive development and are densely connected, the time-series modality remains relatively underexplored and isolated. 
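
The "proportional criterion" fusion named in the entry above is the authors' own term, and the abstract does not spell out the rule, so the following is only one plausible reading: weight each modality's class-probability vector in proportion to its own confidence before averaging. The class count, probabilities, and the confidence measure are all invented for illustration.

    import numpy as np

    def proportional_fusion(p_audio, p_image):
        """Weight each modality in proportion to its top-1 confidence, then
        renormalize. An illustrative interpretation, not the paper's exact rule."""
        c_a, c_i = p_audio.max(), p_image.max()
        w_a, w_i = c_a / (c_a + c_i), c_i / (c_a + c_i)
        fused = w_a * p_audio + w_i * p_image
        return fused / fused.sum()

    # Hypothetical 8-class expression outputs from the two single-modality branches.
    p_audio = np.array([0.05, 0.10, 0.40, 0.15, 0.10, 0.08, 0.07, 0.05])
    p_image = np.array([0.02, 0.05, 0.70, 0.08, 0.05, 0.04, 0.03, 0.03])
    fused = proportional_fusion(p_audio, p_image)
    print(np.round(fused, 3), "-> predicted class", int(fused.argmax()))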
arXiv:2503.11835 [cs.LG, cs.CV] https://arxiv.org/abs/2503.11835
Title: How Can Time Series Analysis Benefit From Multiple Modalities? A Survey and Outlook
Authors: Haoxin Liu, Harshavardhan Kamarthi, Zhiyuan Zhao, Shangqing Xu, Shiyu Wang, Qingsong Wen, Tom Hartvigsen, Fei Wang, B. Aditya Prakash
Abstract: Time series analysis (TSA) is a longstanding research topic in the data mining community and has wide real-world significance. Compared to "richer" modalities such as language and vision, which have recently experienced explosive development and are densely connected, the time-series modality remains relatively underexplored and isolated. We notice that many recent TSA works have formed a new research field, i.e., Multiple Modalities for TSA (MM4TSA). In general, these MM4TSA works follow a common motivation: how TSA can benefit from multiple modalities. This survey is the first to offer a comprehensive review and a detailed outlook for this emerging field. Specifically, we systematically discuss three benefits: (1) reusing foundation models of other modalities for efficient TSA, (2) multimodal extension for enhanced TSA, and (3) cross-modality interaction for advanced TSA. We further group the works by the introduced modality type, including text, images, audio, tables, and others, within each perspective. Finally, we identify the gaps with future opportunities, including reused-modality selection, heterogeneous modality combination, and unseen-task generalization, corresponding to the three benefits. We release an up-to-date GitHub repository that includes key papers and resources.
Submitted 18 March, 2025; v1 submitted 14 March, 2025; originally announced March 2025.
Comments: GitHub repo: https://github.com/AdityaLab/MM4TSA

arXiv:2503.11674 [cs.AR, cs.AI] https://arxiv.org/abs/2503.11674
Title: Timing-Driven Global Placement by Efficient Critical Path Extraction
Authors: Yunqi Shi, Siyuan Xu, Shixiong Kai, Xi Lin, Ke Xue, Mingxuan Yuan, Chao Qian
Abstract: Timing optimization during the global placement of integrated circuits has been a significant focus for decades, yet it remains a complex, unresolved issue. Recent analytical methods typically use pin-level timing information to adjust net weights, which is fast and simple but neglects the path-based nature of the timing graph. The existing path-based methods, however, cannot balance accuracy and efficiency due to the exponential growth of the number of critical paths. In this work, we propose a GPU-accelerated timing-driven global placement framework, integrating accurate path-level information into the efficient DREAMPlace infrastructure. It optimizes the fine-grained pin-to-pin attraction objective and is facilitated by efficient critical path extraction. We also design a quadratic distance loss function specifically to align with the RC timing model. Experimental results demonstrate that our method significantly outperforms the current leading timing-driven placers, achieving an average improvement of 40.5% in total negative slack (TNS) and 8.3% in worst negative slack (WNS), as well as an improvement in half-perimeter wirelength (HPWL).
Submitted 28 February, 2025; originally announced March 2025.
Comments: Accepted at DATE'25 with a Best Paper Award.
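
To make the "pin-to-pin attraction objective" and "quadratic distance loss" mentioned above concrete, here is a minimal differentiable sketch in PyTorch: a handful of hypothetical critical-path pin pairs are pulled together under a criticality-weighted squared-distance penalty. The pin set, pair list, weights, and direct optimization of pin coordinates are assumptions for illustration, not the paper's DREAMPlace-based formulation.

    import torch

    torch.manual_seed(0)
    # Illustrative pin coordinates (x, y), optimized directly for simplicity;
    # a real placer moves cells and derives pin locations from them.
    pins = torch.randn(100, 2, requires_grad=True)

    # Hypothetical extracted critical-path pin pairs and their criticality weights.
    pairs = torch.tensor([[0, 5], [5, 17], [17, 42], [42, 99]])
    criticality = torch.tensor([1.0, 0.8, 1.2, 0.5])

    def pin2pin_quadratic_loss(p, pairs, w):
        """sum_i w_i * ||p_a(i) - p_b(i)||^2 : a quadratic penalty loosely mirrors
        the Elmore-style intuition that RC delay grows superlinearly with length."""
        diff = p[pairs[:, 0]] - p[pairs[:, 1]]
        return (w * (diff ** 2).sum(dim=1)).sum()

    opt = torch.optim.Adam([pins], lr=0.05)
    for _ in range(200):
        opt.zero_grad()
        loss = pin2pin_quadratic_loss(pins, pairs, criticality)
        loss.backward()
        opt.step()
    print("final pin-to-pin attraction loss:", float(loss))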
arXiv:2503.11420 [cs.RO, cs.CV] https://arxiv.org/abs/2503.11420
Title: AQUA-SLAM: Tightly-Coupled Underwater Acoustic-Visual-Inertial SLAM with Sensor Calibration
Authors: Shida Xu, Kaicheng Zhang, Sen Wang
Abstract: Underwater environments pose significant challenges for visual Simultaneous Localization and Mapping (SLAM) systems due to limited visibility, inadequate illumination, and sporadic loss of structural features in images. Addressing these challenges, this paper introduces a novel, tightly-coupled Acoustic-Visual-Inertial SLAM approach, termed AQUA-SLAM, to fuse a Doppler Velocity Log (DVL), a stereo camera, and an Inertial Measurement Unit (IMU) within a graph optimization framework. Moreover, we propose an efficient sensor calibration technique, encompassing multi-sensor extrinsic calibration (among the DVL, camera and IMU) and DVL transducer misalignment calibration, with a fast linear approximation procedure for real-time online execution. The proposed methods are extensively evaluated in a tank environment with ground truth, and validated for offshore applications in the North Sea. The results demonstrate that our method surpasses current state-of-the-art underwater and visual-inertial SLAM systems in terms of localization accuracy and robustness. The proposed system will be made open-source for the community.
Submitted 14 March, 2025; originally announced March 2025.

arXiv:2503.10523 [cs.CV] https://arxiv.org/abs/2503.10523
Title: Interactive Multimodal Fusion with Temporal Modeling
Authors: Jun Yu, Yongqi Wang, Lei Wang, Yang Zheng, Shengfan Xu
Abstract: This paper presents our method for the estimation of valence-arousal (VA) in the 8th Affective Behavior Analysis in-the-Wild (ABAW) competition. Our approach integrates visual and audio information through a multimodal framework. The visual branch uses a pre-trained ResNet model to extract spatial features from facial images. The audio branches employ pre-trained VGG models to extract VGGish and LogMel features from speech signals. These features undergo temporal modeling using Temporal Convolutional Networks (TCNs). We then apply cross-modal attention mechanisms, where visual features interact with audio features through query-key-value attention structures. Finally, the features are concatenated and passed through a regression layer to predict valence and arousal. Our method achieves competitive performance on the Aff-Wild2 dataset, demonstrating effective multimodal fusion for VA estimation in-the-wild.
Submitted 13 March, 2025; originally announced March 2025.
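
The abstract above describes TCN temporal modeling followed by query-key-value cross-modal attention and a regression head for valence-arousal. The sketch below wires those pieces together with standard torch.nn building blocks; all dimensions, the single dilated convolution standing in for a full TCN stack, and the pooling choice are assumptions for illustration, not the authors' architecture.

    import torch
    import torch.nn as nn

    class CrossModalVARegressor(nn.Module):
        """Visual features attend to audio features (Q=visual, K=V=audio), then the
        two streams are concatenated and regressed to (valence, arousal).
        Feature dimensions and layer sizes here are illustrative assumptions."""
        def __init__(self, d_vis=512, d_aud=128, d_model=256, n_heads=4):
            super().__init__()
            self.proj_v = nn.Linear(d_vis, d_model)
            self.proj_a = nn.Linear(d_aud, d_model)
            # A single dilated temporal convolution as a stand-in for a full TCN stack.
            self.tcn = nn.Conv1d(d_model, d_model, kernel_size=3, padding=2, dilation=2)
            self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
            self.head = nn.Sequential(nn.Linear(2 * d_model, 128), nn.ReLU(),
                                      nn.Linear(128, 2), nn.Tanh())  # assume VA in [-1, 1]

        def forward(self, vis, aud):          # vis: (B, T, d_vis), aud: (B, T, d_aud)
            v = self.proj_v(vis)
            a = self.proj_a(aud)
            a = self.tcn(a.transpose(1, 2))[..., :a.shape[1]].transpose(1, 2)
            v_att, _ = self.attn(query=v, key=a, value=a)   # cross-modal attention
            fused = torch.cat([v_att, a], dim=-1)
            return self.head(fused.mean(dim=1))             # (B, 2): valence, arousal

    model = CrossModalVARegressor()
    va = model(torch.randn(4, 16, 512), torch.randn(4, 16, 128))
    print(va.shape)   # torch.Size([4, 2])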
arXiv:2503.10021 [cs.LG, physics.comp-ph] https://arxiv.org/abs/2503.10021
Title: DGNN: A Neural PDE Solver Induced by Discontinuous Galerkin Methods
Authors: Guanyu Chen, Shengze Xu, Dong Ni, Tieyong Zeng
Abstract: We propose a general framework for the Discontinuous Galerkin-induced Neural Network (DGNN), inspired by the Interior Penalty Discontinuous Galerkin Method (IPDGM). In this approach, the trial space consists of a piecewise neural network space defined over the computational domain, while the test function space is composed of piecewise polynomials. We demonstrate the advantages of DGNN in terms of accuracy and training efficiency across several numerical examples, including stationary and time-dependent problems. Specifically, DGNN easily handles high perturbations, discontinuous solutions, and complex geometric domains.
Submitted 14 March, 2025; v1 submitted 13 March, 2025; originally announced March 2025.
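
For readers unfamiliar with the interior-penalty construction referenced above, the standard symmetric interior penalty (SIPG) weak form for the model problem $-\Delta u = f$ on a mesh $\mathcal{T}_h$ with interior faces $\mathcal{E}_h$ reads as follows; this is textbook background only, not the specific DGNN residual, which (per the abstract) replaces the discrete trial space with piecewise neural networks while keeping piecewise-polynomial test functions.

$$
a_h(u, v) = \sum_{K \in \mathcal{T}_h} \int_K \nabla u \cdot \nabla v \, dx
\;-\; \sum_{e \in \mathcal{E}_h} \int_e \left( \{\nabla u\} \cdot \mathbf{n}_e \, [v] + \{\nabla v\} \cdot \mathbf{n}_e \, [u] \right) ds
\;+\; \sum_{e \in \mathcal{E}_h} \frac{\sigma}{h_e} \int_e [u]\,[v] \, ds ,
$$

with the discrete problem: find $u_h$ in the trial space such that $a_h(u_h, v_h) = \int_\Omega f\, v_h \, dx$ for all test functions $v_h$. Here $\{\cdot\}$ and $[\cdot]$ denote the average and jump across a face $e$, $h_e$ is the face diameter, and $\sigma > 0$ is the penalty parameter.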
arXiv:2503.09641 [cs.GR] https://arxiv.org/abs/2503.09641
Title: SANA-Sprint: One-Step Diffusion with Continuous-Time Consistency Distillation
Authors: Junsong Chen, Shuchen Xue, Yuyang Zhao, Jincheng Yu, Sayak Paul, Junyu Chen, Han Cai, Enze Xie, Song Han
Abstract: This paper presents SANA-Sprint, an efficient diffusion model for ultra-fast text-to-image (T2I) generation. SANA-Sprint is built on a pre-trained foundation model and augmented with hybrid distillation, dramatically reducing inference steps from 20 to 1-4. We introduce three key innovations: (1) We propose a training-free approach that transforms a pre-trained flow-matching model for continuous-time consistency distillation (sCM), eliminating costly training from scratch and achieving high training efficiency. Our hybrid distillation strategy combines sCM with latent adversarial distillation (LADD): sCM ensures alignment with the teacher model, while LADD enhances single-step generation fidelity. (2) SANA-Sprint is a unified step-adaptive model that achieves high-quality generation in 1-4 steps, eliminating step-specific training and improving efficiency. (3) We integrate ControlNet with SANA-Sprint for real-time interactive image generation, enabling instant visual feedback for user interaction. SANA-Sprint establishes a new Pareto frontier in speed-quality tradeoffs, achieving state-of-the-art performance with 7.59 FID and 0.74 GenEval in only 1 step, outperforming FLUX-schnell (7.94 FID / 0.71 GenEval) while being 10x faster (0.1s vs 1.1s on H100). It also achieves 0.1s (T2I) and 0.25s (ControlNet) latency for 1024 x 1024 images on H100, and 0.31s (T2I) on an RTX 4090, showcasing its exceptional efficiency and potential for AI-powered consumer applications (AIPC). Code and pre-trained models will be open-sourced.
Submitted 23 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.
Comments: 22 pages, 11 figures, 8 tables, in submission.

arXiv:2503.09223 [cs.IR, cs.AI] https://arxiv.org/abs/2503.09223
DOI: 10.1145/3701716.3715246 (https://doi.org/10.1145/3701716.3715246)
Title: LREF: A Novel LLM-based Relevance Framework for E-commerce
Authors: Tian Tang, Zhixing Tian, Zhenyu Zhu, Chenyang Wang, Haiqing Hu, Guoyu Tang, Lin Liu, Sulong Xu
Abstract: Query and product relevance prediction is a critical component for ensuring a smooth user experience in e-commerce search. Traditional studies mainly focus on BERT-based models to assess the semantic relevance between queries and products. However, the discriminative paradigm and limited knowledge capacity of these approaches restrict their ability to comprehend the relevance between queries and products fully. With the rapid advancement of Large Language Models (LLMs), recent research has begun to explore their application to industrial search systems, as LLMs provide extensive world knowledge and flexible optimization for reasoning processes. Nonetheless, directly leveraging LLMs for relevance prediction tasks introduces new challenges, including a high demand for data quality, the necessity for meticulous optimization of reasoning processes, and an optimistic bias that can result in over-recall. To overcome the above problems, this paper proposes a novel framework called the LLM-based RElevance Framework (LREF) aimed at enhancing e-commerce search relevance. The framework comprises three main stages: supervised fine-tuning (SFT) with Data Selection, Multiple Chain of Thought (Multi-CoT) tuning, and Direct Preference Optimization (DPO) for de-biasing. We evaluate the performance of the framework through a series of offline experiments on large-scale real-world datasets, as well as online A/B testing. The results indicate significant improvements in both offline and online metrics. Ultimately, the model was deployed in a well-known e-commerce application, yielding substantial commercial benefits.
Submitted 12 March, 2025; originally announced March 2025.
arXiv:2503.08372 [cs.RO] https://arxiv.org/abs/2503.08372
Title: MetaFold: Language-Guided Multi-Category Garment Folding Framework via Trajectory Generation and Foundation Model
Authors: Haonan Chen, Junxiao Li, Ruihai Wu, Yiwei Liu, Yiwen Hou, Zhixuan Xu, Jingxiang Guo, Chongkai Gao, Zhenyu Wei, Shensi Xu, Jiaqi Huang, Lin Shao
Abstract: Garment folding is a common yet challenging task in robotic manipulation. The deformability of garments leads to a vast state space and complex dynamics, which complicates precise and fine-grained manipulation. Previous approaches often rely on predefined key points or demonstrations, limiting their generalization across diverse garment categories. This paper presents a framework, MetaFold, that disentangles task planning from action prediction, learning each independently to enhance model generalization. It employs language-guided point cloud trajectory generation for task planning and a low-level foundation model for action prediction. This structure facilitates multi-category learning, enabling the model to adapt flexibly to various user instructions and folding tasks. Experimental results demonstrate the superiority of our proposed framework. Supplementary materials are available on our website: https://meta-fold.github.io/.
Submitted 11 March, 2025; originally announced March 2025.

arXiv:2503.07235 [cs.CV] https://arxiv.org/abs/2503.07235
Title: Retinex-MEF: Retinex-based Glare Effects Aware Unsupervised Multi-Exposure Image Fusion
Authors: Haowen Bai, Jiangshe Zhang, Zixiang Zhao, Lilun Deng, Yukun Cui, Shuang Xu
Abstract: Multi-exposure image fusion consolidates multiple low dynamic range images of the same scene into a singular high dynamic range image. Retinex theory, which separates image illumination from scene reflectance, is naturally adopted to ensure consistent scene representation and effective information fusion across varied exposure levels. However, the conventional pixel-wise multiplication of illumination and reflectance inadequately models the glare effect induced by overexposure. To better adapt this theory for multi-exposure image fusion, we introduce an unsupervised and controllable method termed Retinex-MEF. Specifically, our method decomposes multi-exposure images into separate illumination components and a shared reflectance component, effectively modeling the glare induced by overexposure. Employing a bidirectional loss constraint to learn the common reflectance component, our approach effectively mitigates the glare effect. Furthermore, we establish a controllable exposure fusion criterion, enabling global exposure adjustments while preserving contrast, thus overcoming the constraints of fixed-level fusion. A series of experiments across multiple datasets, including underexposure-overexposure fusion, exposure control fusion, and homogeneous extreme exposure fusion, demonstrates the effective decomposition and flexible fusion capability of our model.
Submitted 10 March, 2025; originally announced March 2025.
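
As a toy illustration of the Retinex model discussed above, the snippet below builds multi-exposure images as pixel-wise products of per-exposure illumination and a shared reflectance, recovers the shared reflectance by least squares, and shows that clipping from overexposure (a crude stand-in for glare) is where the plain multiplicative model breaks down. The scene, illumination gains, and known-illumination assumption are all fabricated for the example; the paper's glare modeling and bidirectional loss are not reproduced.

    import numpy as np

    rng = np.random.default_rng(1)

    # Toy "scene": shared reflectance R in (0, 1] and three exposure-dependent
    # illumination maps L_k (constant gains here for simplicity).
    H, W = 64, 64
    R_true = rng.uniform(0.2, 1.0, size=(H, W))
    L_true = np.stack([np.full((H, W), g) for g in (0.3, 0.8, 1.6)])  # under/mid/over

    # Classical Retinex forward model: each exposure is I_k = L_k * R (pixel-wise).
    I = np.clip(L_true * R_true, 0.0, 1.0)   # clipping mimics sensor saturation/glare

    # Recover a shared reflectance by least squares over the K exposures:
    # R_hat = argmin_R sum_k ||I_k - L_k * R||^2 = sum_k L_k*I_k / sum_k L_k^2.
    # Here L_k is assumed known; the paper instead learns both factors without labels.
    R_hat = (L_true * I).sum(axis=0) / (L_true ** 2).sum(axis=0)

    err = np.abs(R_hat - R_true)
    print("mean abs reflectance error:", float(err.mean()))
    print("error where overexposure clipped the signal:",
          float(err[(L_true[-1] * R_true) > 1.0].mean()))  # glare region hurts R_hat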
arXiv:2503.06983 [cs.CV, cs.RO] https://arxiv.org/abs/2503.06983
Title: Griffin: Aerial-Ground Cooperative Detection and Tracking Dataset and Benchmark
Authors: Jiahao Wang, Xiangyu Cao, Jiaru Zhong, Yuner Zhang, Haibao Yu, Lei He, Shaobing Xu
Abstract: Despite significant advancements, autonomous driving systems continue to struggle with occluded objects and long-range detection due to the inherent limitations of single-perspective sensing. Aerial-ground cooperation offers a promising solution by integrating UAVs' aerial views with ground vehicles' local observations. However, progress in this emerging field has been hindered by the absence of public datasets and standardized evaluation benchmarks. To address this gap, this paper presents a comprehensive solution for aerial-ground cooperative 3D perception through three key contributions: (1) Griffin, a large-scale multi-modal dataset featuring over 200 dynamic scenes (30k+ frames) with varied UAV altitudes (20-60m), diverse weather conditions, and occlusion-aware 3D annotations, enhanced by CARLA-AirSim co-simulation for realistic UAV dynamics; (2) a unified benchmarking framework for aerial-ground cooperative detection and tracking tasks, including protocols for evaluating communication efficiency, latency tolerance, and altitude adaptability; (3) AGILE, an instance-level intermediate fusion baseline that dynamically aligns cross-view features through query-based interaction, achieving an advantageous balance between communication overhead and perception accuracy. Extensive experiments prove the effectiveness of aerial-ground cooperative perception and demonstrate the direction of further research. The dataset and codes are available at https://github.com/wang-jh18-SVM/Griffin.
Submitted 10 March, 2025; originally announced March 2025.
Comments: 8 pages, 7 figures. This work has been submitted to IROS 2025 for possible publication.

arXiv:2503.05367 [physics.med-ph, cs.LG] https://arxiv.org/abs/2503.05367
Title: Semi-Supervised Learning for Dose Prediction in Targeted Radionuclide: A Synthetic Data Study
Authors: Jing Zhang, Alexandre Bousse, Laetitia Imbert, Song Xue, Kuangyu Shi, Julien Bert
Abstract: Targeted Radionuclide Therapy (TRT) is a modern strategy in radiation oncology that aims to administer a potent radiation dose specifically to cancer cells using cancer-targeting radiopharmaceuticals. Accurate radiation dose estimation tailored to individual patients is crucial. Deep learning, particularly with pre-therapy imaging, holds promise for personalizing TRT doses. However, current methods require large time series of SPECT imaging, which is hardly achievable in routine clinical practice, and thus raises issues of data availability. Our objective is to develop a semi-supervised learning (SSL) solution to personalize dosimetry using pre-therapy images. The aim is to develop an approach that achieves accurate results when PET/CT images are available, but are associated with only a few post-therapy dosimetry data provided by SPECT images. In this work, we introduce an SSL method using a pseudo-label generation approach for regression tasks inspired by the FixMatch framework. The feasibility of the proposed solution was preliminarily evaluated through an in-silico study using synthetic data and Monte Carlo simulation. Experimental results for organ dose prediction yielded promising outcomes, showing that the use of pseudo-labeled data provides better accuracy compared to using only labeled data.
Submitted 7 March, 2025; originally announced March 2025.
Comments: 12 pages, 13 figures, 5 tables.
arXiv:2503.04483 [stat.ML, cs.LG, q-bio.QM] https://arxiv.org/abs/2503.04483
Title: InfoSEM: A Deep Generative Model with Informative Priors for Gene Regulatory Network Inference
Authors: Tianyu Cui, Song-Jun Xu, Artem Moskalev, Shuwei Li, Tommaso Mansi, Mangal Prakash, Rui Liao
Abstract: Inferring Gene Regulatory Networks (GRNs) from gene expression data is crucial for understanding biological processes. While supervised models are reported to achieve high performance for this task, they rely on costly ground truth (GT) labels and risk learning gene-specific biases, such as class imbalances of GT interactions, rather than true regulatory mechanisms. To address these issues, we introduce InfoSEM, an unsupervised generative model that leverages textual gene embeddings as informative priors, improving GRN inference without GT labels. InfoSEM can also integrate GT labels as an additional prior when available, avoiding biases and further enhancing performance. Additionally, we propose a biologically motivated benchmarking framework that better reflects real-world applications such as biomarker discovery and reveals learned biases of existing supervised methods. InfoSEM outperforms existing models by 38.5% across four datasets using textual embedding priors and further boosts performance by 11.1% when integrating labeled data as priors.
Submitted 6 March, 2025; originally announced March 2025.
Comments: ICLR 2025 AI4NA Oral, ICLR 2025 MLGenX Spotlight, ICLR 2025 LMRL.

arXiv:2503.03704 [cs.LG] https://arxiv.org/abs/2503.03704
Title: A Practical Memory Injection Attack against LLM Agents
Authors: Shen Dong, Shaochen Xu, Pengfei He, Yige Li, Jiliang Tang, Tianming Liu, Hui Liu, Zhen Xiang
Abstract: Agents based on large language models (LLMs) have demonstrated strong capabilities in a wide range of complex, real-world applications. However, LLM agents with a compromised memory bank may easily produce harmful outputs when the past records retrieved for demonstration are malicious. In this paper, we propose a novel Memory INJection Attack, MINJA, that enables the injection of malicious records into the memory bank by only interacting with the agent via queries and output observations. These malicious records are designed to elicit a sequence of malicious reasoning steps leading to undesirable agent actions when executing the victim user's query. Specifically, we introduce a sequence of bridging steps to link the victim query to the malicious reasoning steps. During the injection of the malicious record, we propose an indication prompt to guide the agent to autonomously generate our designed bridging steps. We also propose a progressive shortening strategy that gradually removes the indication prompt, so that the malicious record is easily retrieved when the victim query is processed later. Our extensive experiments across diverse agents demonstrate the effectiveness of MINJA in compromising agent memory. With minimal requirements for execution, MINJA enables any user to influence agent memory, highlighting practical risks of LLM agents.
Submitted 6 March, 2025; v1 submitted 5 March, 2025; originally announced March 2025.

arXiv:2503.03698 [cs.PL] https://arxiv.org/abs/2503.03698
Title: AEGIS: Towards Formalized and Practical Memory-Safe Execution of C programs via MSWASM
Authors: Shahram Esmaeilsabzali, Arayi Khalatyan, Zhijun Mo, Sruthi Venkatanarayanan, Shengjie Xu
Abstract: Programs written in unsafe languages such as C are prone to memory safety errors, which can lead to program compromises and serious real-world security consequences. Recently, Memory-Safe WebAssembly (MSWASM) was introduced as a general-purpose intermediate bytecode with built-in memory safety semantics. Programs written in C can be compiled into MSWASM to get complete memory safety protection. In this paper, we present our extensions to MSWASM, which improve its semantics and practicality. First, we formalize MSWASM semantics in Coq/Iris, extending it with inter-module interaction, showing that MSWASM provides fine-grained isolation guarantees analogous to WASM's coarse-grained isolation via linear memory. Second, we present Aegis, a system to adopt the memory safety of MSWASM for C programs in an interoperable way. The Aegis pipeline generates Checked C source code from MSWASM modules to enforce spatial memory safety. Checked C is a recent binary-compatible extension of C which can provide guaranteed spatial safety. Our design allows Aegis to protect C programs that depend on legacy C libraries with no extra dependency and with low overhead. The Aegis pipeline incurs 67% runtime overhead and near-zero memory overhead on PolyBenchC programs compared to native.
Submitted 5 March, 2025; originally announced March 2025.
ACM Class: D.3.0

arXiv:2503.03125 [cs.RO] https://arxiv.org/abs/2503.03125
Title: Don't Shake the Wheel: Momentum-Aware Planning in End-to-End Autonomous Driving
Authors: Ziying Song, Caiyan Jia, Lin Liu, Hongyu Pan, Yongchang Zhang, Junming Wang, Xingyu Zhang, Shaoqing Xu, Lei Yang, Yadan Luo
Abstract: End-to-end autonomous driving frameworks enable seamless integration of perception and planning but often rely on one-shot trajectory prediction, which may lead to unstable control and vulnerability to occlusions in single-frame perception. To address this, we propose the Momentum-Aware Driving (MomAD) framework, which introduces trajectory momentum and perception momentum to stabilize and refine trajectory predictions. MomAD comprises two core components: (1) Topological Trajectory Matching (TTM) employs Hausdorff Distance to select the optimal planning query that aligns with prior paths to ensure coherence; (2) Momentum Planning Interactor (MPI) cross-attends the selected planning query with historical queries to expand static and dynamic perception files. This enriched query, in turn, helps regenerate long-horizon trajectory and reduce collision risks.
To mitigate noise arising from dynamic environments and detection errors, we introduce robust instance denoising during training, enabling the planning model to focus on critical signals and improve its robustness. We also propose a novel Trajectory Prediction Consistency (TPC) metric to quantitatively assess planning stability. Experiments on the nuScenes dataset demonstrate that MomAD achieves superior long-term consistency (&gt;=3s) compared to SOTA methods. Moreover, evaluations on the curated Turning-nuScenes dataset show that MomAD reduces the collision rate by 26% and improves TPC by 0.97m (33.45%) over a 6s prediction horizon, while closed-loop evaluation on Bench2Drive demonstrates up to a 16.3% improvement in success rate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03125v2-abstract-full').style.display = 'none'; document.getElementById('2503.03125v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.02334">arXiv:2503.02334</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.02334">pdf</a>, <a href="https://arxiv.org/format/2503.02334">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BiasICL: In-Context Learning and Demographic Biases of Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Sonnet Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Janizek%2C+J">Joseph Janizek</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yixing Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Daneshjou%2C+R">Roxana Daneshjou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.02334v1-abstract-short" style="display: inline;"> Vision language models (VLMs) show promise in medical diagnosis, but their performance across demographic subgroups when using in-context learning (ICL) remains poorly understood. We examine how the demographic composition of demonstration examples affects VLM performance in two medical imaging tasks: skin lesion malignancy prediction and pneumothorax detection from chest radiographs.
Our analysis&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02334v1-abstract-full').style.display = 'inline'; document.getElementById('2503.02334v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02334v1-abstract-full" style="display: none;"> Vision language models (VLMs) show promise in medical diagnosis, but their performance across demographic subgroups when using in-context learning (ICL) remains poorly understood. We examine how the demographic composition of demonstration examples affects VLM performance in two medical imaging tasks: skin lesion malignancy prediction and pneumothorax detection from chest radiographs. Our analysis reveals that ICL influences model predictions through multiple mechanisms: (1) ICL allows VLMs to learn subgroup-specific disease base rates from prompts and (2) ICL leads VLMs to make predictions that perform differently across demographic groups, even after controlling for subgroup-specific disease base rates. Our empirical results inform best-practices for prompting current VLMs (specifically examining demographic subgroup performance, and matching base rates of labels to target distribution at a bulk level and within subgroups), while also suggesting next steps for improving our theoretical understanding of these models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02334v1-abstract-full').style.display = 'none'; document.getElementById('2503.02334v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20509">arXiv:2502.20509</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.20509">pdf</a>, <a href="https://arxiv.org/format/2502.20509">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoCa-CXR: Contrastive Captioners Learn Strong Temporal Structures for Chest X-Ray Vision-Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yixiong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shawn Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Sellergren%2C+A">Andrew Sellergren</a>, <a href="/search/cs?searchtype=author&amp;query=Matias%2C+Y">Yossi Matias</a>, <a href="/search/cs?searchtype=author&amp;query=Hassidim%2C+A">Avinatan Hassidim</a>, <a href="/search/cs?searchtype=author&amp;query=Shetty%2C+S">Shravya Shetty</a>, <a href="/search/cs?searchtype=author&amp;query=Golden%2C+D">Daniel Golden</a>, <a href="/search/cs?searchtype=author&amp;query=Yuille%2C+A">Alan Yuille</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20509v1-abstract-short" style="display: inline;"> Vision-language models have proven to be of great benefit for medical image analysis since they learn rich semantics from both images and reports. Prior efforts have focused on better alignment of image and text representations to enhance image understanding. However, though explicit reference to a prior image is common in Chest X-Ray (CXR) reports, aligning progression descriptions with the seman&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20509v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20509v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20509v1-abstract-full" style="display: none;"> Vision-language models have proven to be of great benefit for medical image analysis since they learn rich semantics from both images and reports. Prior efforts have focused on better alignment of image and text representations to enhance image understanding. However, though explicit reference to a prior image is common in Chest X-Ray (CXR) reports, aligning progression descriptions with the semantics differences in image pairs remains under-explored. In this work, we propose two components to address this issue. (1) A CXR report processing pipeline to extract temporal structure. It processes reports with a large language model (LLM) to separate the description and comparison contexts, and extracts fine-grained annotations from reports. (2) A contrastive captioner model for CXR, namely CoCa-CXR, to learn how to both describe images and their temporal progressions. CoCa-CXR incorporates a novel regional cross-attention module to identify local differences between paired CXR images. Extensive experiments show the superiority of CoCa-CXR on both progression analysis and report generation compared to previous methods. 
Notably, on MS-CXR-T progression classification, CoCa-CXR obtains 65.0% average testing accuracy on five pulmonary conditions, outperforming the previous state-of-the-art (SOTA) model BioViL-T by 4.8%. It also achieves a RadGraph F1 of 24.2% on MIMIC-CXR, which is comparable to the Med-Gemini foundation model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20509v1-abstract-full').style.display = 'none'; document.getElementById('2502.20509v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20390">arXiv:2502.20390</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.20390">pdf</a>, <a href="https://arxiv.org/format/2502.20390">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> InterMimic: Towards Universal Whole-Body Control for Physics-Based Human-Object Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Sirui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+H+Y">Hung Yu Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu-Xiong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+L">Liang-Yan Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20390v1-abstract-short" style="display: inline;"> Achieving realistic simulations of humans interacting with a wide range of objects has long been a fundamental goal. Extending physics-based motion imitation to complex human-object interactions (HOIs) is challenging due to intricate human-object coupling, variability in object geometries, and artifacts in motion capture data, such as inaccurate contacts and limited hand detail. We introduce Inter&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20390v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20390v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20390v1-abstract-full" style="display: none;"> Achieving realistic simulations of humans interacting with a wide range of objects has long been a fundamental goal. Extending physics-based motion imitation to complex human-object interactions (HOIs) is challenging due to intricate human-object coupling, variability in object geometries, and artifacts in motion capture data, such as inaccurate contacts and limited hand detail. We introduce InterMimic, a framework that enables a single policy to robustly learn from hours of imperfect MoCap data covering diverse full-body interactions with dynamic and varied objects. 
Our key insight is to employ a curriculum strategy -- perfect first, then scale up. We first train subject-specific teacher policies to mimic, retarget, and refine motion capture data. Next, we distill these teachers into a student policy, with the teachers acting as online experts providing direct supervision, as well as high-quality references. Notably, we incorporate RL fine-tuning on the student policy to surpass mere demonstration replication and achieve higher-quality solutions. Our experiments demonstrate that InterMimic produces realistic and diverse interactions across multiple HOI datasets. The learned policy generalizes in a zero-shot manner and seamlessly integrates with kinematic generators, elevating the framework from mere imitation to generative modeling of complex human-object interactions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20390v1-abstract-full').style.display = 'none'; document.getElementById('2502.20390v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025. Project Page: https://sirui-xu.github.io/InterMimic/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19991">arXiv:2502.19991</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.19991">pdf</a>, <a href="https://arxiv.org/format/2502.19991">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Collaborative Object Handover in a Robot Crafting Assistant </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tian%2C+L">Leimin Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shiyu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+K">Kerry He</a>, <a href="/search/cs?searchtype=author&amp;query=Love%2C+R">Rachel Love</a>, <a href="/search/cs?searchtype=author&amp;query=Cosgun%2C+A">Akansel Cosgun</a>, <a href="/search/cs?searchtype=author&amp;query=Kulic%2C+D">Dana Kulic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19991v1-abstract-short" style="display: inline;"> Robots are increasingly working alongside people, delivering food to patrons in restaurants or helping workers on assembly lines. These scenarios often involve object handovers between the person and the robot. To achieve safe and efficient human-robot collaboration (HRC), it is important to incorporate human context in a robot&#39;s handover strategies. 
Therefore, in this work, we develop a collabora&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19991v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19991v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19991v1-abstract-full" style="display: none;"> Robots are increasingly working alongside people, delivering food to patrons in restaurants or helping workers on assembly lines. These scenarios often involve object handovers between the person and the robot. To achieve safe and efficient human-robot collaboration (HRC), it is important to incorporate human context in a robot&#39;s handover strategies. Therefore, in this work, we develop a collaborative handover model trained on human teleoperation data collected in a naturalistic crafting task. To evaluate the performance of this model, we conduct cross-validation experiments on the training dataset as well as a user study in the same HRC crafting task. The handover episodes and user perceptions of the autonomous handover policy were compared with those of the human teleoperated handovers. While the cross-validation experiment and user study indicate that the autonomous policy successfully achieved collaborative handovers, the comparison with human teleoperation revealed avenues for further improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19991v1-abstract-full').style.display = 'none'; document.getElementById('2502.19991v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19628">arXiv:2502.19628</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.19628">pdf</a>, <a href="https://arxiv.org/format/2502.19628">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3701716.3715589">10.1145/3701716.3715589 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> PCL: Prompt-based Continual Learning for User Modeling in Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Mingdai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yanhui Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shaoyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+T">Tianchen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yetian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+S">Simone Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jia Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yan Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19628v1-abstract-short" style="display: inline;"> User modeling in large e-commerce platforms aims to optimize user experiences by incorporating various customer activities. Traditional models targeting a single task often focus on specific business metrics, neglecting the comprehensive user behavior, and thus limiting their effectiveness. To develop more generalized user representations, some existing work adopts Multi-task Learning (MTL)approac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19628v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19628v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19628v1-abstract-full" style="display: none;"> User modeling in large e-commerce platforms aims to optimize user experiences by incorporating various customer activities. Traditional models targeting a single task often focus on specific business metrics, neglecting the comprehensive user behavior, and thus limiting their effectiveness. To develop more generalized user representations, some existing work adopts Multi-task Learning (MTL)approaches. But they all face the challenges of optimization imbalance and inefficiency in adapting to new tasks. Continual Learning (CL), which allows models to learn new tasks incrementally and independently, has emerged as a solution to MTL&#39;s limitations. However, CL faces the challenge of catastrophic forgetting, where previously learned knowledge is lost when the model is learning the new task. 
Inspired by the success of prompt tuning in Pretrained Language Models (PLMs), we propose PCL, a Prompt-based Continual Learning framework for user modeling, which utilizes position-wise prompts as external memory for each task, preserving knowledge and mitigating catastrophic forgetting. Additionally, we design contextual prompts to capture and leverage inter-task relationships during prompt tuning. We conduct extensive experiments on real-world datasets to demonstrate PCL&#39;s effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19628v1-abstract-full').style.display = 'none'; document.getElementById('2502.19628v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages. Accepted by www&#39;25 as short paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19313">arXiv:2502.19313</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.19313">pdf</a>, <a href="https://arxiv.org/format/2502.19313">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoopDETR: A Unified Cooperative Perception Framework for 3D Detection via Object Query </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shaocong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+X">Xucai Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tongda Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingjing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yilun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Ya-Qin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19313v1-abstract-short" style="display: inline;"> Cooperative perception enhances the individual perception capabilities of autonomous vehicles (AVs) by providing a comprehensive view of the environment. However, balancing perception performance and transmission costs remains a significant challenge. 
Current approaches that transmit region-level features across agents are limited in interpretability and demand substantial bandwidth, making them u&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19313v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19313v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19313v1-abstract-full" style="display: none;"> Cooperative perception enhances the individual perception capabilities of autonomous vehicles (AVs) by providing a comprehensive view of the environment. However, balancing perception performance and transmission costs remains a significant challenge. Current approaches that transmit region-level features across agents are limited in interpretability and demand substantial bandwidth, making them unsuitable for practical applications. In this work, we propose CoopDETR, a novel cooperative perception framework that introduces object-level feature cooperation via object query. Our framework consists of two key modules: single-agent query generation, which efficiently encodes raw sensor data into object queries, reducing transmission cost while preserving essential information for detection; and cross-agent query fusion, which includes Spatial Query Matching (SQM) and Object Query Aggregation (OQA) to enable effective interaction between queries. Our experiments on the OPV2V and V2XSet datasets demonstrate that CoopDETR achieves state-of-the-art performance and significantly reduces transmission costs to 1/782 of previous methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19313v1-abstract-full').style.display = 'none'; document.getElementById('2502.19313v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18600">arXiv:2502.18600</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18600">pdf</a>, <a href="https://arxiv.org/format/2502.18600">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Chain of Draft: Thinking Faster by Writing Less </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Silei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+W">Wenhao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Lingxiao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+P">Pengcheng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18600v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable performance in solving complex reasoning tasks through mechanisms like Chain-of-Thought (CoT) prompting, which emphasizes verbose, step-by-step reasoning. However, humans typically employ a more efficient strategy: drafting concise intermediate thoughts that capture only essential information. In this work, we propose Chain of Draft (CoD),&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18600v2-abstract-full').style.display = 'inline'; document.getElementById('2502.18600v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18600v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable performance in solving complex reasoning tasks through mechanisms like Chain-of-Thought (CoT) prompting, which emphasizes verbose, step-by-step reasoning. However, humans typically employ a more efficient strategy: drafting concise intermediate thoughts that capture only essential information. In this work, we propose Chain of Draft (CoD), a novel paradigm inspired by human cognitive processes, where LLMs generate minimalistic yet informative intermediate reasoning outputs while solving tasks. By reducing verbosity and focusing on critical insights, CoD matches or surpasses CoT in accuracy while using as little as only 7.6% of the tokens, significantly reducing cost and latency across various reasoning tasks. Our code and data are available at https://github.com/sileix/chain-of-draft. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18600v2-abstract-full').style.display = 'none'; document.getElementById('2502.18600v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18546">arXiv:2502.18546</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18546">pdf</a>, <a href="https://arxiv.org/format/2502.18546">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-class Seismic Building Damage Assessment from InSAR Imagery using Quadratic Variational Causal Bayesian Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuechun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Susu Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18546v1-abstract-short" style="display: inline;"> Interferometric Synthetic Aperture Radar (InSAR) technology uses satellite radar to detect surface deformation patterns and monitor earthquake impacts on buildings. While vital for emergency response planning, extracting multi-class building damage classifications from InSAR data faces challenges: overlapping damage signatures with environmental noise, computational complexity in multi-class scena&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18546v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18546v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18546v1-abstract-full" style="display: none;"> Interferometric Synthetic Aperture Radar (InSAR) technology uses satellite radar to detect surface deformation patterns and monitor earthquake impacts on buildings. While vital for emergency response planning, extracting multi-class building damage classifications from InSAR data faces challenges: overlapping damage signatures with environmental noise, computational complexity in multi-class scenarios, and the need for rapid regional-scale processing. Our novel multi-class variational causal Bayesian inference framework with quadratic variational bounds provides rigorous approximations while ensuring efficiency. By integrating InSAR observations with USGS ground failure models and building fragility functions, our approach separates building damage signals while maintaining computational efficiency through strategic pruning. Evaluation across five major earthquakes (Haiti 2021, Puerto Rico 2020, Zagreb 2020, Italy 2016, Ridgecrest 2019) shows improved damage classification accuracy (AUC: 0.94-0.96), achieving up to 35.7% improvement over existing methods. Our approach maintains high accuracy (AUC &gt; 0.93) across all damage categories while reducing computational overhead by over 40% without requiring extensive ground truth data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18546v1-abstract-full').style.display = 'none'; document.getElementById('2502.18546v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Remote Sensing and Environment</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18282">arXiv:2502.18282</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18282">pdf</a>, <a href="https://arxiv.org/format/2502.18282">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Better Aligned with Survey Respondents or Training Data? Unveiling Political Leanings of LLMs on U.S. Supreme Court Cases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shanshan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Santosh%2C+T+Y+S+S">T. Y. S. S Santosh</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Vogel%2C+Q">Quirin Vogel</a>, <a href="/search/cs?searchtype=author&amp;query=Plank%2C+B">Barbara Plank</a>, <a href="/search/cs?searchtype=author&amp;query=Grabmair%2C+M">Matthias Grabmair</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18282v2-abstract-short" style="display: inline;"> The increased adoption of Large Language Models (LLMs) and their potential to shape public opinion have sparked interest in assessing these models&#39; political leanings. Building on previous research that compared LLMs and human opinions and observed political bias in system responses, we take a step further to investigate the underlying causes of such biases by empirically examining how the values&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18282v2-abstract-full').style.display = 'inline'; document.getElementById('2502.18282v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18282v2-abstract-full" style="display: none;"> The increased adoption of Large Language Models (LLMs) and their potential to shape public opinion have sparked interest in assessing these models&#39; political leanings. Building on previous research that compared LLMs and human opinions and observed political bias in system responses, we take a step further to investigate the underlying causes of such biases by empirically examining how the values and biases embedded in training corpora shape model outputs. Specifically, we propose a method to quantitatively evaluate political leanings embedded in the large pretraining corpora. 
Subsequently, we investigate with which source the LLMs&#39; political leanings are more closely aligned: their pretraining corpora or the surveyed human opinions. As a case study, we focus on probing the political leanings of LLMs in 32 U.S. Supreme Court cases, addressing contentious topics such as abortion and voting rights. Our findings reveal that LLMs strongly reflect the political leanings in their training data, and no strong correlation is observed with their alignment to human opinions as expressed in surveys. These results underscore the importance of responsible curation of training data and the need for robust evaluation metrics to ensure LLMs&#39; alignment with human-centered values. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18282v2-abstract-full').style.display = 'none'; document.getElementById('2502.18282v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18049">arXiv:2502.18049</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18049">pdf</a>, <a href="https://arxiv.org/format/2502.18049">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Golden Ratio Weighting Prevents Model Collapse </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+H">Hengzhi He</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shirong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+G">Guang Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18049v2-abstract-short" style="display: inline;"> Recent studies identified an intriguing phenomenon in recursive generative model training known as model collapse, where models trained on data generated by previous models exhibit severe performance degradation. Addressing this issue and developing more effective training strategies have become central challenges in generative model research. In this paper, we investigate this phenomenon theoreti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18049v2-abstract-full').style.display = 'inline'; document.getElementById('2502.18049v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18049v2-abstract-full" style="display: none;"> Recent studies identified an intriguing phenomenon in recursive generative model training known as model collapse, where models trained on data generated by previous models exhibit severe performance degradation.
Addressing this issue and developing more effective training strategies have become central challenges in generative model research. In this paper, we investigate this phenomenon theoretically within a novel framework, where generative models are iteratively trained on a combination of newly collected real data and synthetic data from the previous training step. To develop an optimal training strategy for integrating real and synthetic data, we evaluate the performance of a weighted training scheme in various scenarios, including Gaussian distribution estimation and linear regression. We theoretically characterize the impact of the mixing proportion and weighting scheme of synthetic data on the final model&#39;s performance. Our key finding is that, across different settings, the optimal weighting scheme under different proportions of synthetic data asymptotically follows a unified expression, revealing a fundamental trade-off between leveraging synthetic data and generative model performance. Notably, in some cases, the optimal weight assigned to real data corresponds to the reciprocal of the golden ratio. Finally, we validate our theoretical results on extensive simulated datasets and a real tabular dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18049v2-abstract-full').style.display = 'none'; document.getElementById('2502.18049v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17701">arXiv:2502.17701</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17701">pdf</a>, <a href="https://arxiv.org/format/2502.17701">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> From Perceptions to Decisions: Wildfire Evacuation Decision Prediction with Behavioral Theory-informed LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Ruxiao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuran Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xilei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Susu Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17701v1-abstract-short" style="display: inline;"> Evacuation decision prediction is critical for efficient and effective wildfire response by helping emergency management anticipate traffic congestion and bottlenecks, allocate resources, and minimize negative impacts. Traditional statistical methods for evacuation decision prediction fail to capture the complex and diverse behavioral logic of different individuals. In this work, for the first tim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17701v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17701v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17701v1-abstract-full" style="display: none;"> Evacuation decision prediction is critical for efficient and effective wildfire response by helping emergency management anticipate traffic congestion and bottlenecks, allocate resources, and minimize negative impacts. Traditional statistical methods for evacuation decision prediction fail to capture the complex and diverse behavioral logic of different individuals. In this work, for the first time, we introduce FLARE, short for facilitating LLM for advanced reasoning on wildfire evacuation decision prediction, a Large Language Model (LLM)-based framework that integrates behavioral theories and models to streamline the Chain-of-Thought (CoT) reasoning and subsequently integrate with memory-based Reinforcement Learning (RL) module to provide accurate evacuation decision prediction and understanding. Our proposed method addresses the limitations of using existing LLMs for evacuation behavioral predictions, such as limited survey data, mismatching with behavioral theory, conflicting individual preferences, implicit and complex mental states, and intractable mental state-behavior mapping. 
Experiments on three post-wildfire survey datasets show an average of 20.47% performance improvement over traditional theory-informed behavioral models, with strong cross-event generalizability. Our complete code is publicly available at https://github.com/SusuXu-s-Lab/FLARE <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17701v1-abstract-full').style.display = 'none'; document.getElementById('2502.17701v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17536">arXiv:2502.17536</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17536">pdf</a>, <a href="https://arxiv.org/format/2502.17536">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CLEP-GAN: An Innovative Approach to Subject-Independent ECG Reconstruction from PPG Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaoyan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shixin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Habib%2C+F">Faisal Habib</a>, <a href="/search/cs?searchtype=author&amp;query=Aminnejad%2C+N">Neda Aminnejad</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+A">Arvind Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huaxiong Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17536v1-abstract-short" style="display: inline;"> This study addresses the challenge of reconstructing unseen ECG signals from PPG signals, a critical task for non-invasive cardiac monitoring. While numerous public ECG-PPG datasets are available, they lack the diversity seen in image datasets, and data collection processes often introduce noise, complicating ECG reconstruction from PPG even with advanced machine learning models. To tackle these c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17536v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17536v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17536v1-abstract-full" style="display: none;"> This study addresses the challenge of reconstructing unseen ECG signals from PPG signals, a critical task for non-invasive cardiac monitoring. While numerous public ECG-PPG datasets are available, they lack the diversity seen in image datasets, and data collection processes often introduce noise, complicating ECG reconstruction from PPG even with advanced machine learning models. 
To tackle these challenges, we first introduce a novel synthetic ECG-PPG data generation technique using an ODE model to enhance training diversity. Next, we develop a novel subject-independent PPG-to-ECG reconstruction model that integrates contrastive learning, adversarial learning, and attention gating, achieving results comparable to or even surpassing existing approaches for unseen ECG reconstruction. Finally, we examine factors such as sex and age that impact reconstruction accuracy, emphasizing the importance of considering demographic diversity during model training and dataset augmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17536v1-abstract-full').style.display = 'none'; document.getElementById('2502.17536v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17494">arXiv:2502.17494</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17494">pdf</a>, <a href="https://arxiv.org/format/2502.17494">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> External Large Foundation Model: How to Efficiently Serve Trillions of Parameters for Online Ads Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+M">Mingfu Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+R">Rong Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Boyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Suo%2C+Q">Qiuling Suo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Q">Qinghai Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Song Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Laming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Hua Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+S">Shali Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jiyan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xiaozhen Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Badr%2C+Y">Yasmine Badr</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+E">Ellie Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shuyu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hansey Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhengyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+J">Jade Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chunzhi Yang</a>, <a 
href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zhichen Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weilin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xingliang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qianru Li</a> , et al. (77 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17494v4-abstract-short" style="display: inline;"> Ads recommendation is a prominent service of online advertising systems and has been actively studied. Recent studies indicate that scaling-up and advanced design of the recommendation model can bring significant performance improvement. However, with a larger model scale, such prior studies have a significantly increasing gap from industry as they often neglect two fundamental challenges in indus&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17494v4-abstract-full').style.display = 'inline'; document.getElementById('2502.17494v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17494v4-abstract-full" style="display: none;"> Ads recommendation is a prominent service of online advertising systems and has been actively studied. Recent studies indicate that scaling-up and advanced design of the recommendation model can bring significant performance improvement. However, with a larger model scale, such prior studies have a significantly increasing gap from industry as they often neglect two fundamental challenges in industrial-scale applications. First, training and inference budgets are restricted for the model to be served, exceeding which may incur latency and impair user experience. Second, large-volume data arrive in a streaming mode with data distributions dynamically shifting, as new users/ads join and existing users/ads leave the system. We propose the External Large Foundation Model (ExFM) framework to address the overlooked challenges. Specifically, we develop external distillation and a data augmentation system (DAS) to control the computational cost of training/inference while maintaining high performance. We design the teacher in a way like a foundation model (FM) that can serve multiple students as vertical models (VMs) to amortize its building cost. We propose Auxiliary Head and Student Adapter to mitigate the data distribution gap between FM and VMs caused by the streaming data issue. Comprehensive experiments on internal industrial-scale applications and public datasets demonstrate significant performance gain by ExFM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17494v4-abstract-full').style.display = 'none'; document.getElementById('2502.17494v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the ACM Web Conference (WWW) 2025 Industrial Track as Oral Presentation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17099">arXiv:2502.17099</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17099">pdf</a>, <a href="https://arxiv.org/format/2502.17099">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improved Diffusion-based Generative Model with Better Adversarial Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+M">Mingyang Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+S">Shuchen Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Ming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+B">Bing Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zhi-Ming Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17099v1-abstract-short" style="display: inline;"> Diffusion Probabilistic Models (DPMs) have achieved significant success in generative tasks. However, their training and sampling processes suffer from the issue of distribution mismatch. During the denoising process, the input data distributions differ between the training and inference stages, potentially leading to inaccurate data generation. To obviate this, we analyze the training objective o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17099v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17099v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17099v1-abstract-full" style="display: none;"> Diffusion Probabilistic Models (DPMs) have achieved significant success in generative tasks. However, their training and sampling processes suffer from the issue of distribution mismatch. During the denoising process, the input data distributions differ between the training and inference stages, potentially leading to inaccurate data generation. To obviate this, we analyze the training objective of DPMs and theoretically demonstrate that this mismatch can be alleviated through Distributionally Robust Optimization (DRO), which is equivalent to performing robustness-driven Adversarial Training (AT) on DPMs. Furthermore, for the recently proposed Consistency Model (CM), which distills the inference process of the DPM, we prove that its training objective also encounters the mismatch issue. Fortunately, this issue can be mitigated by AT as well. Based on these insights, we propose to conduct efficient AT on both DPM and CM. 
arXiv:2502.16611 (https://arxiv.org/abs/2502.16611) [pdf, other]
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
Title: Target Speaker Extraction through Comparing Noisy Positive and Negative Audio Enrollments
Authors: Shitong Xu, Yiyuan Yang, Niki Trigoni, Andrew Markham
Abstract: Target speaker extraction focuses on isolating a specific speaker's voice from an audio mixture containing multiple speakers. To provide information about the target speaker's identity, prior works have utilized clean audio examples as conditioning inputs. However, such clean audio examples are not always readily available (e.g., it is impractical to obtain a clean audio example of a stranger's voice at a cocktail party without stepping away from the noisy environment). Limited prior research has explored extracting the target speaker's characteristics from noisy audio examples, which may include overlapping speech from disturbing speakers. In this work, we focus on target speaker extraction when multiple speakers are present during the enrollment stage, through leveraging differences between audio segments where the target speakers are speaking (Positive Enrollments) and segments where they are not (Negative Enrollments). Experiments show the effectiveness of our model architecture and the dedicated pretraining method for the proposed task. Our method achieves state-of-the-art performance in the proposed application settings and demonstrates strong generalizability across challenging and realistic scenarios.
Submitted 23 February, 2025; originally announced February 2025.
Comments: 16 pages, 5 figures, appendix included

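One simple way to picture the positive/negative enrollment comparison is to embed both segments with a shared encoder and condition the extractor on their difference, which is what the hedged sketch below does. The `EnrollmentEncoder` and the difference-of-embeddings cue are illustrative stand-ins, not the paper's architecture or pretraining method.

```python
# A minimal sketch of conditioning on noisy positive/negative enrollments: a shared
# encoder embeds both segments and their normalized difference serves as a proxy for
# the target speaker's identity vector. Illustration of the comparison idea only.
import torch
import torch.nn as nn
import torch.nn.functional as F

class EnrollmentEncoder(nn.Module):
    def __init__(self, n_mels=80, emb_dim=128):
        super().__init__()
        self.rnn = nn.GRU(n_mels, emb_dim, batch_first=True)

    def forward(self, mel):                      # mel: (batch, frames, n_mels)
        _, h = self.rnn(mel)
        return F.normalize(h[-1], dim=-1)        # one embedding per segment

encoder = EnrollmentEncoder()
pos = torch.randn(4, 200, 80)                    # target speaker talking (plus interference)
neg = torch.randn(4, 200, 80)                    # the same scene without the target speaker
speaker_cue = F.normalize(encoder(pos) - encoder(neg), dim=-1)   # what the mixture gains when the target speaks
print(speaker_cue.shape)                         # (4, 128) conditioning vector for the separator
```
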
arXiv:2502.15260 (https://arxiv.org/abs/2502.15260) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: LightMamba: Efficient Mamba Acceleration on FPGA with Quantization and Hardware Co-design
Authors: Renjie Wei, Songqiang Xu, Linfeng Zhong, Zebin Yang, Qingyu Guo, Yuan Wang, Runsheng Wang, Meng Li
Abstract: State space models (SSMs) like Mamba have recently attracted much attention. Compared to Transformer-based large language models (LLMs), Mamba achieves linear computation complexity with the sequence length and demonstrates superior performance. However, Mamba is hard to accelerate due to the scattered activation outliers and the complex computation dependency, rendering existing LLM accelerators inefficient. In this paper, we propose LightMamba that co-designs the quantization algorithm and FPGA accelerator architecture for efficient Mamba inference. We first propose an FPGA-friendly post-training quantization algorithm that features rotation-assisted quantization and power-of-two SSM quantization to reduce the majority of computation to 4-bit. We further design an FPGA accelerator that partially unrolls the Mamba computation to balance the efficiency and hardware costs. Through computation reordering as well as fine-grained tiling and fusion, the hardware utilization and memory efficiency of the accelerator get drastically improved. We implement LightMamba on Xilinx Versal VCK190 FPGA and achieve 4.65x to 6.06x higher energy efficiency over the GPU baseline. When evaluated on Alveo U280 FPGA, LightMamba reaches 93 tokens/s, which is 1.43x that of the GPU baseline.
Submitted 21 February, 2025; originally announced February 2025.
Comments: Accepted by DATE 2025

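The sketch below shows plain power-of-two quantization, the family of SSM quantization named in the abstract: each value is snapped to sign·2^k so that hardware multiplies reduce to shifts. The exponent range and zero threshold are arbitrary illustrative choices, and rotation-assisted quantization is not shown.

```python
# A minimal sketch of power-of-two (PoT) quantization: round each magnitude to the
# nearest power of two (in the log domain) and keep the sign, so multiplications
# become bit shifts. Bit-width packing and calibration are intentionally omitted.
import numpy as np

def power_of_two_quantize(w: np.ndarray, min_exp: int = -8, max_exp: int = 0):
    sign = np.sign(w)
    mag = np.abs(w)
    exp = np.clip(np.round(np.log2(np.maximum(mag, 2.0 ** min_exp))), min_exp, max_exp)
    q = sign * (2.0 ** exp)
    q[mag < 2.0 ** (min_exp - 1)] = 0.0          # tiny values quantize to zero
    return q, exp.astype(int)

w = np.random.randn(6) * 0.3
q, exp = power_of_two_quantize(w)
for orig, quant, e in zip(w, q, exp):
    print(f"{orig:+.4f} -> {quant:+.4f} (2^{e})")
```
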
arXiv:2502.15177 (https://arxiv.org/abs/2502.15177) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CY (Computers and Society)
Title: Optimizing Product Provenance Verification using Data Valuation Methods
Authors: Raquib Bin Yousuf, Hoang Anh Just, Shengzhe Xu, Brian Mayer, Victor Deklerck, Jakub Truszkowski, John C. Simeone, Jade Saunders, Chang-Tien Lu, Ruoxi Jia, Naren Ramakrishnan
Abstract: Determining and verifying product provenance remains a critical challenge in global supply chains, particularly as geopolitical conflicts and shifting borders create new incentives for misrepresentation of commodities, such as hiding the origin of illegally harvested timber or agriculture grown on illegally cleared land. Stable Isotope Ratio Analysis (SIRA), combined with Gaussian process regression-based isoscapes, has emerged as a powerful tool for geographic origin verification. However, the effectiveness of these models is often constrained by data scarcity and suboptimal dataset selection. In this work, we introduce a novel data valuation framework designed to enhance the selection and utilization of training data for machine learning models applied in SIRA. By prioritizing high-informative samples, our approach improves model robustness and predictive accuracy across diverse datasets and geographies. We validate our methodology with extensive experiments, demonstrating its potential to significantly enhance provenance verification, mitigate fraudulent trade practices, and strengthen regulatory enforcement of global supply chains.
Submitted 16 March, 2025; v1 submitted 20 February, 2025; originally announced February 2025.

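A hedged sketch of the general data valuation idea on top of a Gaussian-process isoscape: score each candidate sample by the marginal change in validation error when it is added to a base training set, a leave-one-in proxy for more sophisticated valuation methods such as Shapley-style estimators (the paper's exact framework may differ). The synthetic data, RBF kernel, and `val_error` helper are illustrative only.

```python
# A minimal sketch of data valuation for a GP regressor: each candidate point is
# valued by how much it reduces validation MSE when added to the training set.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
X_base, X_cand, X_val = rng.uniform(-3, 3, (20, 2)), rng.uniform(-3, 3, (10, 2)), rng.uniform(-3, 3, (30, 2))
f = lambda X: np.sin(X[:, 0]) + 0.5 * X[:, 1]            # stand-in for an isotope-ratio surface
y_base, y_cand, y_val = f(X_base), f(X_cand), f(X_val)

def val_error(X_train, y_train):
    gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0), alpha=1e-3).fit(X_train, y_train)
    return mean_squared_error(y_val, gp.predict(X_val))

baseline = val_error(X_base, y_base)
values = []
for i in range(len(X_cand)):
    err = val_error(np.vstack([X_base, X_cand[i:i + 1]]), np.append(y_base, y_cand[i]))
    values.append(baseline - err)                        # positive value = helpful sample
ranking = np.argsort(values)[::-1]
print("most valuable candidate indices:", ranking[:3])
```
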
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by DATE 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15177">arXiv:2502.15177</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15177">pdf</a>, <a href="https://arxiv.org/format/2502.15177">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Product Provenance Verification using Data Valuation Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yousuf%2C+R+B">Raquib Bin Yousuf</a>, <a href="/search/cs?searchtype=author&amp;query=Just%2C+H+A">Hoang Anh Just</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shengzhe Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Mayer%2C+B">Brian Mayer</a>, <a href="/search/cs?searchtype=author&amp;query=Deklerck%2C+V">Victor Deklerck</a>, <a href="/search/cs?searchtype=author&amp;query=Truszkowski%2C+J">Jakub Truszkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Simeone%2C+J+C">John C. Simeone</a>, <a href="/search/cs?searchtype=author&amp;query=Saunders%2C+J">Jade Saunders</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chang-Tien Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+R">Ruoxi Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Ramakrishnan%2C+N">Naren Ramakrishnan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15177v2-abstract-short" style="display: inline;"> Determining and verifying product provenance remains a critical challenge in global supply chains, particularly as geopolitical conflicts and shifting borders create new incentives for misrepresentation of commodities, such as hiding the origin of illegally harvested timber or agriculture grown on illegally cleared land. Stable Isotope Ratio Analysis (SIRA), combined with Gaussian process regressi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15177v2-abstract-full').style.display = 'inline'; document.getElementById('2502.15177v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15177v2-abstract-full" style="display: none;"> Determining and verifying product provenance remains a critical challenge in global supply chains, particularly as geopolitical conflicts and shifting borders create new incentives for misrepresentation of commodities, such as hiding the origin of illegally harvested timber or agriculture grown on illegally cleared land. Stable Isotope Ratio Analysis (SIRA), combined with Gaussian process regression-based isoscapes, has emerged as a powerful tool for geographic origin verification. However, the effectiveness of these models is often constrained by data scarcity and suboptimal dataset selection. 
arXiv:2502.11965 (https://arxiv.org/abs/2502.11965) [pdf, other]
Subjects: eess.SP (Signal Processing); cs.AI (Artificial Intelligence)
Title: A MIMO Wireless Channel Foundation Model via CIR-CSI Consistency
Authors: Jun Jiang, Wenjun Yu, Yunfan Li, Yuan Gao, Shugong Xu
Abstract: In the field of artificial intelligence, self-supervised learning has demonstrated superior generalization capabilities by leveraging large-scale unlabeled datasets for pretraining, which is especially critical for wireless communication models to adapt to a variety of scenarios. This paper innovatively treats Channel State Information (CSI) and Channel Impulse Response (CIR) as naturally aligned multi-modal data and proposes the first MIMO wireless channel foundation model, named CSI-CLIP. By effectively capturing the joint representations of both CIR and CSI, CSI-CLIP exhibits remarkable adaptability across scenarios and robust feature extraction capabilities. Experimental results show that in the positioning task, CSI-CLIP reduces the mean error distance by 22%; in the beam management task, it increases accuracy by 1% compared to traditional supervised methods, as it does in the channel identification task. These improvements not only highlight the potential and value of CSI-CLIP in integrating sensing and communication but also demonstrate its significant advantages over existing techniques. Moreover, viewing CSI and CIR as multi-modal pairs and applying contrastive learning to a wireless channel foundation model open up new research directions in the domain of MIMO wireless communications.
Submitted 1 March, 2025; v1 submitted 17 February, 2025; originally announced February 2025.
Comments: 6 pages; accepted to 2025 ICMLCN

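Treating CIR and CSI as an aligned pair naturally suggests CLIP-style contrastive pretraining; the sketch below pairs two placeholder encoders with a symmetric InfoNCE loss over a batch. The encoder shapes and the temperature are assumptions, not CSI-CLIP's design.

```python
# A minimal sketch of CLIP-style contrastive pretraining over paired CIR/CSI samples:
# two encoders map each modality to a shared space, and matched pairs are pulled
# together on the diagonal of the batch similarity matrix.
import torch
import torch.nn as nn
import torch.nn.functional as F

cir_encoder = nn.Sequential(nn.Flatten(), nn.Linear(2 * 64, 128))   # toy: 64 complex taps -> 128-d
csi_encoder = nn.Sequential(nn.Flatten(), nn.Linear(2 * 64, 128))   # toy: 64 subcarriers  -> 128-d

def clip_loss(cir_batch, csi_batch, temperature=0.07):
    z_cir = F.normalize(cir_encoder(cir_batch), dim=-1)
    z_csi = F.normalize(csi_encoder(csi_batch), dim=-1)
    logits = z_cir @ z_csi.T / temperature                           # (B, B) similarity matrix
    targets = torch.arange(logits.shape[0])                          # matched pairs sit on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))

cir = torch.randn(16, 2, 64)    # real/imag parts of the channel impulse response
csi = torch.randn(16, 2, 64)    # real/imag parts of the frequency-domain CSI
loss = clip_loss(cir, csi)
loss.backward()
print(float(loss))
```
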
arXiv:2502.11019 (https://arxiv.org/abs/2502.11019) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Unlocking the Power of Function Vectors for Characterizing and Mitigating Catastrophic Forgetting in Continual Instruction Tuning
Authors: Gangwei Jiang, Caigao Jiang, Zhaoyi Li, Siqiao Xue, Jun Zhou, Linqi Song, Defu Lian, Yin Wei
Abstract: Catastrophic forgetting (CF) poses a significant challenge in machine learning, where a model forgets previously learned information upon learning new tasks. Despite the advanced capabilities of Large Language Models (LLMs), they continue to face challenges with CF during continual learning. The majority of existing research focuses on analyzing forgetting patterns through a singular training sequence, thereby overlooking the intricate effects that diverse tasks have on model behavior. Our study explores CF across various settings, discovering that model forgetting is influenced by both the specific training tasks and the models themselves. To this end, we interpret forgetting by examining the function vector (FV), a compact representation of functions in LLMs, offering a model-dependent indicator for the occurrence of CF. Through theoretical and empirical analyses, we demonstrate that CF in LLMs primarily stems from biases in function activation rather than the overwriting of task processing functions. Leveraging these insights, we propose a novel function vector guided training methodology, incorporating a regularization technique to stabilize the FV and mitigate forgetting. Empirical tests on four benchmarks confirm the effectiveness of our proposed training method, substantiating our theoretical framework concerning CF and model function dynamics. We plan to make our code publicly accessible in the near future.
Submitted 16 February, 2025; originally announced February 2025.
Comments: 10 pages

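The sketch below only mirrors the loss structure of "stabilize the function vector while fitting the new task": a compact representation is extracted on a frozen probe batch before continual training and its drift is penalized alongside the new-task loss. The "FV" here is just a mean hidden activation of a toy network, whereas the paper derives it from specific attention heads of an LLM, so treat `function_vector` and `lam` purely as illustrative assumptions.

```python
# A minimal sketch of regularized continual training that penalizes drift of a
# compact "function vector" computed on a fixed probe batch from the old task.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Sequential(nn.Linear(32, 64), nn.Tanh(), nn.Linear(64, 10))

def function_vector(probe_inputs):
    hidden = model[1](model[0](probe_inputs))     # activations after the first block
    return hidden.mean(dim=0)                     # (64,) compact task representation

probe = torch.randn(16, 32)                        # held-out prompts from the *old* task
fv_ref = function_vector(probe).detach()           # frozen before learning the new task

def continual_step(x_new, y_new, lam=10.0):
    task_loss = F.cross_entropy(model(x_new), y_new)
    fv_drift = F.mse_loss(function_vector(probe), fv_ref)
    return task_loss + lam * fv_drift              # stabilize the FV while fitting the new task

loss = continual_step(torch.randn(8, 32), torch.randint(0, 10, (8,)))
loss.backward()
print(float(loss))
```
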
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11965">arXiv:2502.11965</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11965">pdf</a>, <a href="https://arxiv.org/format/2502.11965">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A MIMO Wireless Channel Foundation Model via CIR-CSI Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jun Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenjun Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yunfan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shugong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11965v2-abstract-short" style="display: inline;"> In the field of artificial intelligence, self-supervised learning has demonstrated superior generalization capabilities by leveraging large-scale unlabeled datasets for pretraining, which is especially critical for wireless communication models to adapt to a variety of scenarios. This paper innovatively treats Channel State Information (CSI) and Channel Impulse Response (CIR) as naturally aligned&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11965v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11965v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11965v2-abstract-full" style="display: none;"> In the field of artificial intelligence, self-supervised learning has demonstrated superior generalization capabilities by leveraging large-scale unlabeled datasets for pretraining, which is especially critical for wireless communication models to adapt to a variety of scenarios. This paper innovatively treats Channel State Information (CSI) and Channel Impulse Response (CIR) as naturally aligned multi-modal data and proposes the first MIMO wireless channel foundation model, named CSI-CLIP. By effectively capturing the joint representations of both CIR and CSI, CSI-CLIP exhibits remarkable adaptability across scenarios and robust feature extraction capabilities. Experimental results show that in positioning task, CSI-CLIP reduces the mean error distance by 22%; in beam management task, it increases accuracy by 1% compared to traditional supervised methods, as well as in the channel identification task. These improvements not only highlight the potential and value of CSI-CLIP in integrating sensing and communication but also demonstrate its significant advantages over existing techniques. Moreover, viewing CSI and CIR as multi-modal pairs and contrastive learning for wireless channel foundation model open up new research directions in the domain of MIMO wireless communications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11965v2-abstract-full').style.display = 'none'; document.getElementById('2502.11965v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2025 ICMLCN accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11019">arXiv:2502.11019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11019">pdf</a>, <a href="https://arxiv.org/format/2502.11019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unlocking the Power of Function Vectors for Characterizing and Mitigating Catastrophic Forgetting in Continual Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+G">Gangwei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+C">Caigao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+S">Siqiao Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Linqi Song</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+D">Defu Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yin Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11019v1-abstract-short" style="display: inline;"> Catastrophic forgetting (CF) poses a significant challenge in machine learning, where a model forgets previously learned information upon learning new tasks. Despite the advanced capabilities of Large Language Models (LLMs), they continue to face challenges with CF during continual learning. The majority of existing research focuses on analyzing forgetting patterns through a singular training sequ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11019v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11019v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11019v1-abstract-full" style="display: none;"> Catastrophic forgetting (CF) poses a significant challenge in machine learning, where a model forgets previously learned information upon learning new tasks. Despite the advanced capabilities of Large Language Models (LLMs), they continue to face challenges with CF during continual learning. 
arXiv:2502.10284 (https://arxiv.org/abs/2502.10284) [pdf, other]
DOI: https://doi.org/10.1145/3701716.3715208
Subjects: cs.IR (Information Retrieval); cs.AI (Artificial Intelligence)
Title: A Hybrid Cross-Stage Coordination Pre-ranking Model for Online Recommendation Systems
Authors: Binglei Zhao, Houying Qi, Guang Xu, Mian Ma, Xiwei Zhao, Feng Mei, Sulong Xu, Jinghe Hu
Abstract: Large-scale recommendation systems often adopt cascading architecture consisting of retrieval, pre-ranking, ranking, and re-ranking stages. With strict latency requirements, pre-ranking utilizes lightweight models to perform a preliminary selection from massive retrieved candidates. However, recent works focus solely on improving consistency with ranking, relying exclusively on downstream stages. Since downstream input is derived from the pre-ranking output, they will exacerbate the sample selection bias (SSB) issue and Matthew effect, leading to sub-optimal results. To address the limitation, we propose a novel Hybrid Cross-Stage Coordination Pre-ranking model (HCCP) to integrate information from upstream (retrieval) and downstream (ranking, re-ranking) stages. Specifically, cross-stage coordination refers to the pre-ranking's adaptability to the entire stream and the role of serving as a more effective bridge between upstream and downstream. HCCP consists of Hybrid Sample Construction and Hybrid Objective Optimization. Hybrid sample construction captures multi-level unexposed data from the entire stream and rearranges them to become the optimal guiding "ground truth" for pre-ranking learning. Hybrid objective optimization contains the joint optimization of consistency and long-tail precision through our proposed Margin InfoNCE loss. It is specifically designed to learn from such hybrid unexposed samples, improving the overall performance and mitigating the SSB issue. The appendix describes a proof of the efficacy of the proposed loss in selecting potential positives. Extensive offline and online experiments indicate that HCCP outperforms SOTA methods by improving cross-stage coordination. It contributes up to 14.9% UCVR and 1.3% UCTR in the JD E-commerce recommendation system. Concerning code privacy, we provide a pseudocode for reference.
Submitted 14 February, 2025; originally announced February 2025.
Comments: Accepted by WWW 2025

sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
