Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,554 results for author: <span class="mathjax">Li, W</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Li%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Li, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+W&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14642">arXiv:2502.14642</a> <span> [<a href="https://arxiv.org/pdf/2502.14642">pdf</a>, <a href="https://arxiv.org/format/2502.14642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> How Far are LLMs from Being Our Digital Twins? A Benchmark for Persona-Based Behavior Chain Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xinfeng Yuan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+L">Lei Sha</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhifang Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14642v1-abstract-short" style="display: inline;"> Recently, LLMs have garnered increasing attention across academic disciplines for their potential as human digital twins, virtual proxies designed to replicate individuals and autonomously perform tasks such as decision-making, problem-solving, and reasoning on their behalf. However, current evaluations of LLMs primarily emphasize dialogue simulation while overlooking human behavior simulation, wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14642v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14642v1-abstract-full" style="display: none;"> Recently, LLMs have garnered increasing attention across academic disciplines for their potential as human digital twins, virtual proxies designed to replicate individuals and autonomously perform tasks such as decision-making, problem-solving, and reasoning on their behalf. However, current evaluations of LLMs primarily emphasize dialogue simulation while overlooking human behavior simulation, which is crucial for digital twins. To address this gap, we introduce BehaviorChain, the first benchmark for evaluating LLMs' ability to simulate continuous human behavior. BehaviorChain comprises diverse, high-quality, persona-based behavior chains, totaling 15,846 distinct behaviors across 1,001 unique personas, each with detailed history and profile metadata. For evaluation, we integrate persona metadata into LLMs and employ them to iteratively infer contextually appropriate behaviors within dynamic scenarios provided by BehaviorChain. Comprehensive evaluation results demonstrated that even state-of-the-art models struggle with accurately simulating continuous human behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14642v1-abstract-full').style.display = 'none'; document.getElementById('2502.14642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14332">arXiv:2502.14332</a> <span> [<a href="https://arxiv.org/pdf/2502.14332">pdf</a>, <a href="https://arxiv.org/format/2502.14332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Collaborative Jade Recognition System for Mobile Devices Based on Lightweight and Large Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenyu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjia Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Pengyu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14332v1-abstract-short" style="display: inline;"> With the widespread adoption and development of mobile devices, vision-based recognition applications have become a hot topic in research. Jade, as an important cultural heritage and artistic item, has significant applications in fields such as jewelry identification and cultural relic preservation. However, existing jade recognition systems still face challenges in mobile implementation, such as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14332v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14332v1-abstract-full" style="display: none;"> With the widespread adoption and development of mobile devices, vision-based recognition applications have become a hot topic in research. Jade, as an important cultural heritage and artistic item, has significant applications in fields such as jewelry identification and cultural relic preservation. However, existing jade recognition systems still face challenges in mobile implementation, such as limited computing resources, real-time requirements, and accuracy issues. To address these challenges, this paper proposes a jade recognition system based on size model collaboration, aiming to achieve efficient and accurate jade identification using mobile devices such as smartphones.First, we design a size model based on multi-scale image processing, extracting key visual information by analyzing jade's dimensions, shapes, and surface textures. Then, a collaborative multi-model classification framework is built by combining deep learning and traditional computer vision algorithms. This framework can effectively select and adjust models based on different jade characteristics, providing high accuracy results across various environments and devices.Experimental results show that the proposed system can provide high recognition accuracy and fast processing time on mobile devices, while consuming relatively low computational resources. The system not only holds great application potential but also provides new ideas and technical support for the intelligent development of jade identification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14332v1-abstract-full').style.display = 'none'; document.getElementById('2502.14332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14276">arXiv:2502.14276</a> <span> [<a href="https://arxiv.org/pdf/2502.14276">pdf</a>, <a href="https://arxiv.org/format/2502.14276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> STeCa: Step-level Trajectory Calibration for LLM Agent Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanlin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&query=Leong%2C+C+T">Chak Tou Leong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14276v1-abstract-short" style="display: inline;"> Large language model (LLM)-based agents have shown promise in tackling complex tasks by interacting dynamically with the environment. Existing work primarily focuses on behavior cloning from expert demonstrations and preference learning through exploratory trajectory sampling. However, these methods often struggle in long-horizon tasks, where suboptimal actions accumulate step by step, causing age… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14276v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14276v1-abstract-full" style="display: none;"> Large language model (LLM)-based agents have shown promise in tackling complex tasks by interacting dynamically with the environment. Existing work primarily focuses on behavior cloning from expert demonstrations and preference learning through exploratory trajectory sampling. However, these methods often struggle in long-horizon tasks, where suboptimal actions accumulate step by step, causing agents to deviate from correct task trajectories. To address this, we highlight the importance of timely calibration and the need to automatically construct calibration trajectories for training agents. We propose Step-Level Trajectory Calibration (STeCa), a novel framework for LLM agent learning. Specifically, STeCa identifies suboptimal actions through a step-level reward comparison during exploration. It constructs calibrated trajectories using LLM-driven reflection, enabling agents to learn from improved decision-making processes. These calibrated trajectories, together with successful trajectory data, are utilized for reinforced training. Extensive experiments demonstrate that STeCa significantly outperforms existing methods. Further analysis highlights that step-level calibration enables agents to complete tasks with greater robustness. Our code and data are available at https://github.com/WangHanLinHenry/STeCa. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14276v1-abstract-full').style.display = 'none'; document.getElementById('2502.14276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14145">arXiv:2502.14145</a> <span> [<a href="https://arxiv.org/pdf/2502.14145">pdf</a>, <a href="https://arxiv.org/format/2502.14145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LLM-Enhanced Dialogue Management for Full-Duplex Spoken Dialogue Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiwei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rilin Chen</a>, <a href="/search/cs?searchtype=author&query=Kothapally%2C+V">Vinay Kothapally</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M">Meng Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14145v1-abstract-short" style="display: inline;"> Achieving full-duplex communication in spoken dialogue systems (SDS) requires real-time coordination between listening, speaking, and thinking. This paper proposes a semantic voice activity detection (VAD) module as a dialogue manager (DM) to efficiently manage turn-taking in full-duplex SDS. Implemented as a lightweight (0.5B) LLM fine-tuned on full-duplex conversation data, the semantic VAD pred… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14145v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14145v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14145v1-abstract-full" style="display: none;"> Achieving full-duplex communication in spoken dialogue systems (SDS) requires real-time coordination between listening, speaking, and thinking. This paper proposes a semantic voice activity detection (VAD) module as a dialogue manager (DM) to efficiently manage turn-taking in full-duplex SDS. Implemented as a lightweight (0.5B) LLM fine-tuned on full-duplex conversation data, the semantic VAD predicts four control tokens to regulate turn-switching and turn-keeping, distinguishing between intentional and unintentional barge-ins while detecting query completion for handling user pauses and hesitations. By processing input speech in short intervals, the semantic VAD enables real-time decision-making, while the core dialogue engine (CDE) is only activated for response generation, reducing computational overhead. This design allows independent DM optimization without retraining the CDE, balancing interaction accuracy and inference efficiency for scalable, next-generation full-duplex SDS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14145v1-abstract-full').style.display = 'none'; document.getElementById('2502.14145v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In submission to INTERSPEECH 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13946">arXiv:2502.13946</a> <span> [<a href="https://arxiv.org/pdf/2502.13946">pdf</a>, <a href="https://arxiv.org/format/2502.13946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Why Safeguarded Ships Run Aground? Aligned Large Language Models' Safety Mechanisms Tend to Be Anchored in The Template Region </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Leong%2C+C+T">Chak Tou Leong</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Q">Qingyu Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13946v1-abstract-short" style="display: inline;"> The safety alignment of large language models (LLMs) remains vulnerable, as their initial behavior can be easily jailbroken by even relatively simple attacks. Since infilling a fixed template between the input instruction and initial model output is a common practice for existing LLMs, we hypothesize that this template is a key factor behind their vulnerabilities: LLMs' safety-related decision-mak… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13946v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13946v1-abstract-full" style="display: none;"> The safety alignment of large language models (LLMs) remains vulnerable, as their initial behavior can be easily jailbroken by even relatively simple attacks. Since infilling a fixed template between the input instruction and initial model output is a common practice for existing LLMs, we hypothesize that this template is a key factor behind their vulnerabilities: LLMs' safety-related decision-making overly relies on the aggregated information from the template region, which largely influences these models' safety behavior. We refer to this issue as template-anchored safety alignment. In this paper, we conduct extensive experiments and verify that template-anchored safety alignment is widespread across various aligned LLMs. Our mechanistic analyses demonstrate how it leads to models' susceptibility when encountering inference-time jailbreak attacks. Furthermore, we show that detaching safety mechanisms from the template region is promising in mitigating vulnerabilities to jailbreak attacks. We encourage future research to develop more robust safety alignment techniques that reduce reliance on the template region. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13946v1-abstract-full').style.display = 'none'; document.getElementById('2502.13946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13925">arXiv:2502.13925</a> <span> [<a href="https://arxiv.org/pdf/2502.13925">pdf</a>, <a href="https://arxiv.org/format/2502.13925">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Single Frames: Can LMMs Comprehend Temporal and Contextual Narratives in Image Sequences? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaochen Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jialin Song</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+L">Longyu Guan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yixin Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weiyao Luo</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yifan Pu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiru Wang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xiangdi Meng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhifang Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13925v1-abstract-short" style="display: inline;"> Large Multimodal Models (LMMs) have achieved remarkable success across various visual-language tasks. However, existing benchmarks predominantly focus on single-image understanding, leaving the analysis of image sequences largely unexplored. To address this limitation, we introduce StripCipher, a comprehensive benchmark designed to evaluate capabilities of LMMs to comprehend and reason over sequen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13925v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13925v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13925v1-abstract-full" style="display: none;"> Large Multimodal Models (LMMs) have achieved remarkable success across various visual-language tasks. However, existing benchmarks predominantly focus on single-image understanding, leaving the analysis of image sequences largely unexplored. To address this limitation, we introduce StripCipher, a comprehensive benchmark designed to evaluate capabilities of LMMs to comprehend and reason over sequential images. StripCipher comprises a human-annotated dataset and three challenging subtasks: visual narrative comprehension, contextual frame prediction, and temporal narrative reordering. Our evaluation of $16$ state-of-the-art LMMs, including GPT-4o and Qwen2.5VL, reveals a significant performance gap compared to human capabilities, particularly in tasks that require reordering shuffled sequential images. For instance, GPT-4o achieves only 23.93% accuracy in the reordering subtask, which is 56.07% lower than human performance. Further quantitative analysis discuss several factors, such as input format of images, affecting the performance of LLMs in sequential understanding, underscoring the fundamental challenges that remain in the development of LMMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13925v1-abstract-full').style.display = 'none'; document.getElementById('2502.13925v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13874">arXiv:2502.13874</a> <span> [<a href="https://arxiv.org/pdf/2502.13874">pdf</a>, <a href="https://arxiv.org/format/2502.13874">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> The KnowWhereGraph: A Large-Scale Geo-Knowledge Graph for Interdisciplinary Knowledge Discovery and Geo-Enrichment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Rui Zhu</a>, <a href="/search/cs?searchtype=author&query=Shimizu%2C+C">Cogan Shimizu</a>, <a href="/search/cs?searchtype=author&query=Stephen%2C+S">Shirly Stephen</a>, <a href="/search/cs?searchtype=author&query=Fisher%2C+C+K">Colby K. Fisher</a>, <a href="/search/cs?searchtype=author&query=Thelen%2C+T">Thomas Thelen</a>, <a href="/search/cs?searchtype=author&query=Currier%2C+K">Kitty Currier</a>, <a href="/search/cs?searchtype=author&query=Janowicz%2C+K">Krzysztof Janowicz</a>, <a href="/search/cs?searchtype=author&query=Hitzler%2C+P">Pascal Hitzler</a>, <a href="/search/cs?searchtype=author&query=Schildhauer%2C+M">Mark Schildhauer</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenwen Li</a>, <a href="/search/cs?searchtype=author&query=Rehberger%2C+D">Dean Rehberger</a>, <a href="/search/cs?searchtype=author&query=Barua%2C+A">Adrita Barua</a>, <a href="/search/cs?searchtype=author&query=Christou%2C+A">Antrea Christou</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+L">Ling Cai</a>, <a href="/search/cs?searchtype=author&query=Dalal%2C+A">Abhilekha Dalal</a>, <a href="/search/cs?searchtype=author&query=D%27Onofrio%2C+A">Anthony D'Onofrio</a>, <a href="/search/cs?searchtype=author&query=Eells%2C+A">Andrew Eells</a>, <a href="/search/cs?searchtype=author&query=Faulk%2C+M">Mitchell Faulk</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zilong Liu</a>, <a href="/search/cs?searchtype=author&query=Mai%2C+G">Gengchen Mai</a>, <a href="/search/cs?searchtype=author&query=Mahdavinejad%2C+M+S">Mohammad Saeid Mahdavinejad</a>, <a href="/search/cs?searchtype=author&query=Mecum%2C+B">Bryce Mecum</a>, <a href="/search/cs?searchtype=author&query=Norouzi%2C+S+S">Sanaz Saki Norouzi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+M">Meilin Shi</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuanyuan Tian</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13874v2-abstract-short" style="display: inline;"> Global challenges such as food supply chain disruptions, public health crises, and natural hazard responses require access to and integration of diverse datasets, many of which are geospatial. Over the past few years, a growing number of (geo)portals have been developed to address this need. However, most existing (geo)portals are stacked by separated or sparsely connected data "silos" impeding ef… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13874v2-abstract-full').style.display = 'inline'; document.getElementById('2502.13874v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13874v2-abstract-full" style="display: none;"> Global challenges such as food supply chain disruptions, public health crises, and natural hazard responses require access to and integration of diverse datasets, many of which are geospatial. Over the past few years, a growing number of (geo)portals have been developed to address this need. However, most existing (geo)portals are stacked by separated or sparsely connected data "silos" impeding effective data consolidation. A new way of sharing and reusing geospatial data is therefore urgently needed. In this work, we introduce KnowWhereGraph, a knowledge graph-based data integration, enrichment, and synthesis framework that not only includes schemas and data related to human and environmental systems but also provides a suite of supporting tools for accessing this information. The KnowWhereGraph aims to address the challenge of data integration by building a large-scale, cross-domain, pre-integrated, FAIR-principles-based, and AI-ready data warehouse rooted in knowledge graphs. We highlight the design principles of KnowWhereGraph, emphasizing the roles of space, place, and time in bridging various data "silos". Additionally, we demonstrate multiple use cases where the proposed geospatial knowledge graph and its associated tools empower decision-makers to uncover insights that are often hidden within complex and poorly interoperable datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13874v2-abstract-full').style.display = 'none'; document.getElementById('2502.13874v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13595">arXiv:2502.13595</a> <span> [<a href="https://arxiv.org/pdf/2502.13595">pdf</a>, <a href="https://arxiv.org/format/2502.13595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> MMTEB: Massive Multilingual Text Embedding Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Enevoldsen%2C+K">Kenneth Enevoldsen</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+I">Isaac Chung</a>, <a href="/search/cs?searchtype=author&query=Kerboua%2C+I">Imene Kerboua</a>, <a href="/search/cs?searchtype=author&query=Kardos%2C+M">M谩rton Kardos</a>, <a href="/search/cs?searchtype=author&query=Mathur%2C+A">Ashwin Mathur</a>, <a href="/search/cs?searchtype=author&query=Stap%2C+D">David Stap</a>, <a href="/search/cs?searchtype=author&query=Gala%2C+J">Jay Gala</a>, <a href="/search/cs?searchtype=author&query=Siblini%2C+W">Wissam Siblini</a>, <a href="/search/cs?searchtype=author&query=Krzemi%C5%84ski%2C+D">Dominik Krzemi艅ski</a>, <a href="/search/cs?searchtype=author&query=Winata%2C+G+I">Genta Indra Winata</a>, <a href="/search/cs?searchtype=author&query=Sturua%2C+S">Saba Sturua</a>, <a href="/search/cs?searchtype=author&query=Utpala%2C+S">Saiteja Utpala</a>, <a href="/search/cs?searchtype=author&query=Ciancone%2C+M">Mathieu Ciancone</a>, <a href="/search/cs?searchtype=author&query=Schaeffer%2C+M">Marion Schaeffer</a>, <a href="/search/cs?searchtype=author&query=Sequeira%2C+G">Gabriel Sequeira</a>, <a href="/search/cs?searchtype=author&query=Misra%2C+D">Diganta Misra</a>, <a href="/search/cs?searchtype=author&query=Dhakal%2C+S">Shreeya Dhakal</a>, <a href="/search/cs?searchtype=author&query=Rystr%C3%B8m%2C+J">Jonathan Rystr酶m</a>, <a href="/search/cs?searchtype=author&query=Solomatin%2C+R">Roman Solomatin</a>, <a href="/search/cs?searchtype=author&query=%C3%87a%C4%9Fatan%2C+%C3%96">脰mer 脟a臒atan</a>, <a href="/search/cs?searchtype=author&query=Kundu%2C+A">Akash Kundu</a>, <a href="/search/cs?searchtype=author&query=Bernstorff%2C+M">Martin Bernstorff</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+S">Shitao Xiao</a>, <a href="/search/cs?searchtype=author&query=Sukhlecha%2C+A">Akshita Sukhlecha</a>, <a href="/search/cs?searchtype=author&query=Pahwa%2C+B">Bhavish Pahwa</a> , et al. (61 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13595v1-abstract-short" style="display: inline;"> Text embeddings are typically evaluated on a limited set of tasks, which are constrained by language, domain, and task diversity. To address these limitations and provide a more comprehensive evaluation, we introduce the Massive Multilingual Text Embedding Benchmark (MMTEB) - a large-scale, community-driven expansion of MTEB, covering over 500 quality-controlled evaluation tasks across 250+ langua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13595v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13595v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13595v1-abstract-full" style="display: none;"> Text embeddings are typically evaluated on a limited set of tasks, which are constrained by language, domain, and task diversity. To address these limitations and provide a more comprehensive evaluation, we introduce the Massive Multilingual Text Embedding Benchmark (MMTEB) - a large-scale, community-driven expansion of MTEB, covering over 500 quality-controlled evaluation tasks across 250+ languages. MMTEB includes a diverse set of challenging, novel tasks such as instruction following, long-document retrieval, and code retrieval, representing the largest multilingual collection of evaluation tasks for embedding models to date. Using this collection, we develop several highly multilingual benchmarks, which we use to evaluate a representative set of models. We find that while large language models (LLMs) with billions of parameters can achieve state-of-the-art performance on certain language subsets and task categories, the best-performing publicly available model is multilingual-e5-large-instruct with only 560 million parameters. To facilitate accessibility and reduce computational cost, we introduce a novel downsampling method based on inter-task correlation, ensuring a diverse selection while preserving relative model rankings. Furthermore, we optimize tasks such as retrieval by sampling hard negatives, creating smaller but effective splits. These optimizations allow us to introduce benchmarks that drastically reduce computational demands. For instance, our newly introduced zero-shot English benchmark maintains a ranking order similar to the full-scale version but at a fraction of the computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13595v1-abstract-full').style.display = 'none'; document.getElementById('2502.13595v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ICLR: https://openreview.net/forum?id=zl3pfz4VCV</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13352">arXiv:2502.13352</a> <span> [<a href="https://arxiv.org/pdf/2502.13352">pdf</a>, <a href="https://arxiv.org/format/2502.13352">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Integrated Sensing and Communication for 6G Holographic Digital Twins </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haijun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiangnan Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haojin Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chen Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13352v1-abstract-short" style="display: inline;"> With the advent of 6G networks, offering ultra-high bandwidth and ultra-low latency, coupled with the enhancement of terminal device resolutions, holographic communication is gradually becoming a reality. Holographic digital twin (HDT) is considered one of key applications of holographic communication, capable of creating virtual replicas for real-time mapping and prediction of physical entity sta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13352v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13352v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13352v1-abstract-full" style="display: none;"> With the advent of 6G networks, offering ultra-high bandwidth and ultra-low latency, coupled with the enhancement of terminal device resolutions, holographic communication is gradually becoming a reality. Holographic digital twin (HDT) is considered one of key applications of holographic communication, capable of creating virtual replicas for real-time mapping and prediction of physical entity states, and performing three-dimensional reproduction of spatial information. In this context, integrated sensing and communication (ISAC) is expected to be a crucial pathway for providing data sources to HDT. This paper proposes a four-layer architecture assisted by ISAC for HDT, integrating emerging paradigms and key technologies to achieve low-cost, high-precision environmental data collection for constructing HDT. Specifically, to enhance sensing resolution, we explore super-resolution techniques from the perspectives of parameter estimation and point cloud construction. Additionally, we focus on multi-point collaborative sensing for constructing HDT, and provide a comprehensive review of four key techniques: node selection, multi-band collaboration, cooperative beamforming, and data fusion. Finally, we highlight several interesting research directions to guide and inspire future work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13352v1-abstract-full').style.display = 'none'; document.getElementById('2502.13352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13311">arXiv:2502.13311</a> <span> [<a href="https://arxiv.org/pdf/2502.13311">pdf</a>, <a href="https://arxiv.org/format/2502.13311">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Training Turn-by-Turn Verifiers for Dialogue Tutoring Agents: The Curious Case of LLMs as Your Coding Tutors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yinpei Dai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Ziqiao Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+J">Joyce Chai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13311v1-abstract-short" style="display: inline;"> Intelligent tutoring agents powered by large language models (LLMs) have been increasingly explored to deliver personalized guidance in areas such as language learning and science education. However, their capabilities in guiding users to solve complex real-world tasks remain underexplored. To address this limitation, in this work, we focus on coding tutoring, a challenging problem that requires t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13311v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13311v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13311v1-abstract-full" style="display: none;"> Intelligent tutoring agents powered by large language models (LLMs) have been increasingly explored to deliver personalized guidance in areas such as language learning and science education. However, their capabilities in guiding users to solve complex real-world tasks remain underexplored. To address this limitation, in this work, we focus on coding tutoring, a challenging problem that requires tutors to proactively guide students toward completing predefined coding tasks. We propose a novel agent workflow, Trace-and-Verify (TRAVER), which combines knowledge tracing to estimate a student's knowledge state and turn-by-turn verification to ensure effective guidance toward task completion. We introduce DICT, an automatic evaluation protocol that assesses tutor agents holistically using controlled student simulation and code generation tests. Extensive experiments reveal the challenges of coding tutoring and demonstrate that TRAVER achieves a significantly higher success rate. Although we use code tutoring as an example in this paper, our results and findings can be extended beyond coding, providing valuable insights into advancing tutoring agents for a variety of tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13311v1-abstract-full').style.display = 'none'; document.getElementById('2502.13311v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13137">arXiv:2502.13137</a> <span> [<a href="https://arxiv.org/pdf/2502.13137">pdf</a>, <a href="https://arxiv.org/format/2502.13137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Theorem Prover as a Judge for Synthetic Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Leang%2C+J+O+J">Joshua Ong Jun Leang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+G">Giwon Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenda Li</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+S+B">Shay B. Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13137v1-abstract-short" style="display: inline;"> The demand for synthetic data in mathematical reasoning has increased due to its potential to enhance the mathematical capabilities of large language models (LLMs). However, ensuring the validity of intermediate reasoning steps remains a significant challenge, affecting data quality. While formal verification via theorem provers effectively validates LLM reasoning, the autoformalisation of mathema… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13137v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13137v1-abstract-full" style="display: none;"> The demand for synthetic data in mathematical reasoning has increased due to its potential to enhance the mathematical capabilities of large language models (LLMs). However, ensuring the validity of intermediate reasoning steps remains a significant challenge, affecting data quality. While formal verification via theorem provers effectively validates LLM reasoning, the autoformalisation of mathematical proofs remains error-prone. In response, we introduce iterative autoformalisation, an approach that iteratively refines theorem prover formalisation to mitigate errors, thereby increasing the execution rate on the Lean prover from 60% to 87%. Building upon that, we introduce Theorem Prover as a Judge (TP-as-a-Judge), a method that employs theorem prover formalisation to rigorously assess LLM intermediate reasoning, effectively integrating autoformalisation with synthetic data generation. Finally, we present Reinforcement Learning from Theorem Prover Feedback (RLTPF), a framework that replaces human annotation with theorem prover feedback in Reinforcement Learning from Human Feedback (RLHF). Across multiple LLMs, applying TP-as-a-Judge and RLTPF improves benchmarks with only 3,508 samples, achieving 5.56% accuracy gain on Mistral-7B for MultiArith, 6.00% on Llama-2-7B for SVAMP, and 3.55% on Llama-3.1-8B for AQUA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13137v1-abstract-full').style.display = 'none'; document.getElementById('2502.13137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12961">arXiv:2502.12961</a> <span> [<a href="https://arxiv.org/pdf/2502.12961">pdf</a>, <a href="https://arxiv.org/format/2502.12961">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Tool Use in Large Language Models with Meta-Cognition Trigger </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjun Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dexun Li</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+K">Kuicai Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiwen Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yasheng Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruiming Tang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12961v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown remarkable emergent capabilities, transforming the execution of functional tasks by leveraging external tools for complex problems that require specialized processing or real-time data. While existing research expands LLMs access to diverse tools (e.g., program interpreters, search engines, weather/map apps), the necessity of using these tools is often overl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12961v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12961v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12961v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown remarkable emergent capabilities, transforming the execution of functional tasks by leveraging external tools for complex problems that require specialized processing or real-time data. While existing research expands LLMs access to diverse tools (e.g., program interpreters, search engines, weather/map apps), the necessity of using these tools is often overlooked, leading to indiscriminate tool invocation. This naive approach raises two key issues:(1) increased delays due to unnecessary tool calls, and (2) potential errors resulting from faulty interactions with external tools. In this paper, we introduce meta-cognition as a proxy for LLMs self-assessment of their capabilities, representing the model's awareness of its own limitations. Based on this, we propose MeCo, an adaptive decision-making strategy for external tool use. MeCo quantifies metacognitive scores by capturing high-level cognitive signals in the representation space, guiding when to invoke tools. Notably, MeCo is fine-tuning-free and incurs minimal cost. Our experiments show that MeCo accurately detects LLMs' internal cognitive signals and significantly improves tool-use decision-making across multiple base models and benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12961v1-abstract-full').style.display = 'none'; document.getElementById('2502.12961v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12799">arXiv:2502.12799</a> <span> [<a href="https://arxiv.org/pdf/2502.12799">pdf</a>, <a href="https://arxiv.org/format/2502.12799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Towards Text-Image Interleaved Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Ziqi Dai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongqi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanzhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+D">Dingkun Long</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+P">Pengjun Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meishan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jun Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12799v1-abstract-short" style="display: inline;"> Current multimodal information retrieval studies mainly focus on single-image inputs, which limits real-world applications involving multiple images and text-image interleaved content. In this work, we introduce the text-image interleaved retrieval (TIIR) task, where the query and document are interleaved text-image sequences, and the model is required to understand the semantics from the interlea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12799v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12799v1-abstract-full" style="display: none;"> Current multimodal information retrieval studies mainly focus on single-image inputs, which limits real-world applications involving multiple images and text-image interleaved content. In this work, we introduce the text-image interleaved retrieval (TIIR) task, where the query and document are interleaved text-image sequences, and the model is required to understand the semantics from the interleaved context for effective retrieval. We construct a TIIR benchmark based on naturally interleaved wikiHow tutorials, where a specific pipeline is designed to generate interleaved queries. To explore the task, we adapt several off-the-shelf retrievers and build a dense baseline by interleaved multimodal large language model (MLLM). We then propose a novel Matryoshka Multimodal Embedder (MME), which compresses the number of visual tokens at different granularity, to address the challenge of excessive visual tokens in MLLM-based TIIR models. Experiments demonstrate that simple adaption of existing models does not consistently yield effective results. Our MME achieves significant improvements over the baseline by substantially fewer visual tokens. We provide extensive analysis and will release the dataset and code to facilitate future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12799v1-abstract-full').style.display = 'none'; document.getElementById('2502.12799v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12658">arXiv:2502.12658</a> <span> [<a href="https://arxiv.org/pdf/2502.12658">pdf</a>, <a href="https://arxiv.org/format/2502.12658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> R.R.: Unveiling LLM Training Privacy through Recollection and Ranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+W">Wenlong Meng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhenyuan Guo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lenan Wu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+C">Chen Gong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyan Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weixian Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+C">Chengkun Wei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenzhi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12658v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) pose significant privacy risks, potentially leaking training data due to implicit memorization. Existing privacy attacks primarily focus on membership inference attacks (MIAs) or data extraction attacks, but reconstructing specific personally identifiable information (PII) in LLM's training data remains challenging. In this paper, we propose R.R. (Recollect and Rank),… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12658v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12658v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12658v1-abstract-full" style="display: none;"> Large Language Models (LLMs) pose significant privacy risks, potentially leaking training data due to implicit memorization. Existing privacy attacks primarily focus on membership inference attacks (MIAs) or data extraction attacks, but reconstructing specific personally identifiable information (PII) in LLM's training data remains challenging. In this paper, we propose R.R. (Recollect and Rank), a novel two-step privacy stealing attack that enables attackers to reconstruct PII entities from scrubbed training data where the PII entities have been masked. In the first stage, we introduce a prompt paradigm named recollection, which instructs the LLM to repeat a masked text but fill in masks. Then we can use PII identifiers to extract recollected PII candidates. In the second stage, we design a new criterion to score each PII candidate and rank them. Motivated by membership inference, we leverage the reference model as a calibration to our criterion. Experiments across three popular PII datasets demonstrate that the R.R. achieves better PII identical performance compared to baselines. These results highlight the vulnerability of LLMs to PII leakage even when training data has been scrubbed. We release the replicate package of R.R. at a link. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12658v1-abstract-full').style.display = 'none'; document.getElementById('2502.12658v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12067">arXiv:2502.12067</a> <span> [<a href="https://arxiv.org/pdf/2502.12067">pdf</a>, <a href="https://arxiv.org/format/2502.12067">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TokenSkip: Controllable Chain-of-Thought Compression in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongqi Li</a>, <a href="/search/cs?searchtype=author&query=Leong%2C+C+T">Chak Tou Leong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenjie Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12067v1-abstract-short" style="display: inline;"> Chain-of-Thought (CoT) has been proven effective in enhancing the reasoning capabilities of large language models (LLMs). Recent advancements, such as OpenAI's o1 and DeepSeek-R1, suggest that scaling up the length of CoT sequences during inference could further boost LLM reasoning performance. However, due to the autoregressive nature of LLM decoding, longer CoT outputs lead to a linear increase… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12067v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12067v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12067v1-abstract-full" style="display: none;"> Chain-of-Thought (CoT) has been proven effective in enhancing the reasoning capabilities of large language models (LLMs). Recent advancements, such as OpenAI's o1 and DeepSeek-R1, suggest that scaling up the length of CoT sequences during inference could further boost LLM reasoning performance. However, due to the autoregressive nature of LLM decoding, longer CoT outputs lead to a linear increase in inference latency, adversely affecting user experience, particularly when the CoT exceeds 10,000 tokens. To address this limitation, we analyze the semantic importance of tokens within CoT outputs and reveal that their contributions to reasoning vary. Building on this insight, we propose TokenSkip, a simple yet effective approach that enables LLMs to selectively skip less important tokens, allowing for controllable CoT compression. Extensive experiments across various models and tasks demonstrate the effectiveness of TokenSkip in reducing CoT token usage while preserving strong reasoning performance. Notably, when applied to Qwen2.5-14B-Instruct, TokenSkip reduces reasoning tokens by 40% (from 313 to 181) on GSM8K, with less than a 0.4% performance drop. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12067v1-abstract-full').style.display = 'none'; document.getElementById('2502.12067v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11946">arXiv:2502.11946</a> <span> [<a href="https://arxiv.org/pdf/2502.11946">pdf</a>, <a href="https://arxiv.org/format/2502.11946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+A">Ailin Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Boyong Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bruce Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+C">Chao Yan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chen Hu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chengli Feng</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Fei Tian</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+F">Feiyu Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingbei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingrui Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+R">Ruihang Miao</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Wang You</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuerui Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yechang Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jianjian Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Brian Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chengting Feng</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hanpeng Hu</a> , et al. (120 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11946v2-abstract-short" style="display: inline;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contribu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11946v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11946v2-abstract-full" style="display: none;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'none'; document.getElementById('2502.11946v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11775">arXiv:2502.11775</a> <span> [<a href="https://arxiv.org/pdf/2502.11775">pdf</a>, <a href="https://arxiv.org/format/2502.11775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> video-SALMONN-o1: Reasoning-enhanced Audio-visual Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yudong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+J">Jimin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Changli Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yixuan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=MA%2C+Z">Zejun MA</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11775v1-abstract-short" style="display: inline;"> While recent advancements in reasoning optimization have significantly enhanced the capabilities of large language models (LLMs), existing efforts to improve reasoning have been limited to solving mathematical problems and focusing on visual graphical inputs, neglecting broader applications in general video understanding.This paper proposes video-SALMONN-o1, the first open-source reasoning-enhance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11775v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11775v1-abstract-full" style="display: none;"> While recent advancements in reasoning optimization have significantly enhanced the capabilities of large language models (LLMs), existing efforts to improve reasoning have been limited to solving mathematical problems and focusing on visual graphical inputs, neglecting broader applications in general video understanding.This paper proposes video-SALMONN-o1, the first open-source reasoning-enhanced audio-visual LLM designed for general video understanding tasks. To enhance its reasoning abilities, we develop a reasoning-intensive dataset featuring challenging audio-visual questions with step-by-step solutions. We also propose process direct preference optimization (pDPO), which leverages contrastive step selection to achieve efficient step-level reward modelling tailored for multimodal inputs. Additionally, we introduce RivaBench, the first reasoning-intensive video understanding benchmark, featuring over 4,000 high-quality, expert-curated question-answer pairs across scenarios such as standup comedy, academic presentations, and synthetic video detection. video-SALMONN-o1 achieves 3-8% accuracy improvements over the LLaVA-OneVision baseline across different video reasoning benchmarks. Besides, pDPO achieves 6-8% improvements compared to the supervised fine-tuning model on RivaBench. Enhanced reasoning enables video-SALMONN-o1 zero-shot synthetic video detection capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11775v1-abstract-full').style.display = 'none'; document.getElementById('2502.11775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11726">arXiv:2502.11726</a> <span> [<a href="https://arxiv.org/pdf/2502.11726">pdf</a>, <a href="https://arxiv.org/format/2502.11726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.cag.2025.104176">10.1016/j.cag.2025.104176 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> No-reference geometry quality assessment for colorless point clouds via list-wise rank learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+B">Bingxu Xie</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+C">Chao Chu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiqing Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhiyong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11726v1-abstract-short" style="display: inline;"> Geometry quality assessment (GQA) of colorless point clouds is crucial for evaluating the performance of emerging point cloud-based solutions (e.g., watermarking, compression, and 3-Dimensional (3D) reconstruction). Unfortunately, existing objective GQA approaches are traditional full-reference metrics, whereas state-of-the-art learning-based point cloud quality assessment (PCQA) methods target bo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11726v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11726v1-abstract-full" style="display: none;"> Geometry quality assessment (GQA) of colorless point clouds is crucial for evaluating the performance of emerging point cloud-based solutions (e.g., watermarking, compression, and 3-Dimensional (3D) reconstruction). Unfortunately, existing objective GQA approaches are traditional full-reference metrics, whereas state-of-the-art learning-based point cloud quality assessment (PCQA) methods target both color and geometry distortions, neither of which are qualified for the no-reference GQA task. In addition, the lack of large-scale GQA datasets with subjective scores, which are always imprecise, biased, and inconsistent, also hinders the development of learning-based GQA metrics. Driven by these limitations, this paper proposes a no-reference geometry-only quality assessment approach based on list-wise rank learning, termed LRL-GQA, which comprises of a geometry quality assessment network (GQANet) and a list-wise rank learning network (LRLNet). The proposed LRL-GQA formulates the no-reference GQA as a list-wise rank problem, with the objective of directly optimizing the entire quality ordering. Specifically, a large dataset containing a variety of geometry-only distortions is constructed first, named LRL dataset, in which each sample is label-free but coupled with quality ranking information. Then, the GQANet is designed to capture intrinsic multi-scale patch-wise geometric features in order to predict a quality index for each point cloud. After that, the LRLNet leverages the LRL dataset and a likelihood loss to train the GQANet and ranks the input list of degraded point clouds according to their distortion levels. In addition, the pre-trained GQANet can be fine-tuned further to obtain absolute quality scores. Experimental results demonstrate the superior performance of the proposed no-reference LRL-GQA method compared with existing full-reference GQA metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11726v1-abstract-full').style.display = 'none'; document.getElementById('2502.11726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Computers & Graphics, Volume 127, April 2025, 104176 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11710">arXiv:2502.11710</a> <span> [<a href="https://arxiv.org/pdf/2502.11710">pdf</a>, <a href="https://arxiv.org/format/2502.11710">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2025.3541445">10.1109/TCSVT.2025.3541445 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The Worse The Better: Content-Aware Viewpoint Generation Network for Projection-related Point Cloud Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhiyong Su</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+B">Bingxu Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jincan Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiqing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11710v1-abstract-short" style="display: inline;"> Through experimental studies, however, we observed the instability of final predicted quality scores, which change significantly over different viewpoint settings. Inspired by the "wooden barrel theory", given the default content-independent viewpoints of existing projection-related PCQA approaches, this paper presents a novel content-aware viewpoint generation network (CAVGN) to learn better view… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11710v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11710v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11710v1-abstract-full" style="display: none;"> Through experimental studies, however, we observed the instability of final predicted quality scores, which change significantly over different viewpoint settings. Inspired by the "wooden barrel theory", given the default content-independent viewpoints of existing projection-related PCQA approaches, this paper presents a novel content-aware viewpoint generation network (CAVGN) to learn better viewpoints by taking the distribution of geometric and attribute features of degraded point clouds into consideration. Firstly, the proposed CAVGN extracts multi-scale geometric and texture features of the entire input point cloud, respectively. Then, for each default content-independent viewpoint, the extracted geometric and texture features are refined to focus on its corresponding visible part of the input point cloud. Finally, the refined geometric and texture features are concatenated to generate an optimized viewpoint. To train the proposed CAVGN, we present a self-supervised viewpoint ranking network (SSVRN) to select the viewpoint with the worst quality projected image to construct a default-optimized viewpoint dataset, which consists of thousands of paired default viewpoints and corresponding optimized viewpoints. Experimental results show that the projection-related PCQA methods can achieve higher performance using the viewpoints generated by the proposed CAVGN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11710v1-abstract-full').style.display = 'none'; document.getElementById('2502.11710v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in IEEE Transactions on Circuits and Systems for Video Technology</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11607">arXiv:2502.11607</a> <span> [<a href="https://arxiv.org/pdf/2502.11607">pdf</a>, <a href="https://arxiv.org/format/2502.11607">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GraphThought: Graph Combinatorial Optimization with Thought Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zixiao Huang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lifeng Guo</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+J">Junjie Sheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haosheng Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Changhong Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11607v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable capabilities across various domains, especially in text processing and generative tasks. Recent advancements in the reasoning capabilities of state-of-the-art LLMs, such as OpenAI-o1, have significantly broadened their applicability, particularly in complex problem-solving and logical inference. However, most existing LLMs struggle with not… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11607v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11607v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11607v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable capabilities across various domains, especially in text processing and generative tasks. Recent advancements in the reasoning capabilities of state-of-the-art LLMs, such as OpenAI-o1, have significantly broadened their applicability, particularly in complex problem-solving and logical inference. However, most existing LLMs struggle with notable limitations in handling graph combinatorial optimization (GCO) problems. To bridge this gap, we formally define the Optimal Thoughts Design (OTD) problem, including its state and action thought space. We then introduce a novel framework, GraphThought, designed to generate high-quality thought datasets for GCO problems. Leveraging these datasets, we fine-tune the Llama-3-8B-Instruct model to develop Llama-GT. Notably, despite its compact 8B-parameter architecture, Llama-GT matches the performance of state-of-the-art LLMs on the GraphArena benchmark. Experimental results show that our approach outperforms both proprietary and open-source models, even rivaling specialized models like o1-mini. This work sets a new state-of-the-art benchmark while challenging the prevailing notion that model scale is the primary driver of reasoning capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11607v1-abstract-full').style.display = 'none'; document.getElementById('2502.11607v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">41 pages, 5 figures, 13 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11560">arXiv:2502.11560</a> <span> [<a href="https://arxiv.org/pdf/2502.11560">pdf</a>, <a href="https://arxiv.org/format/2502.11560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Automatic Prompt Engineering: An Optimization Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenwu Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11560v1-abstract-short" style="display: inline;"> The rise of foundation models has shifted focus from resource-intensive fine-tuning to prompt engineering, a paradigm that steers model behavior through input design rather than weight updates. While manual prompt engineering faces limitations in scalability, adaptability, and cross-modal alignment, automated methods, spanning foundation model (FM) based optimization, evolutionary methods, gradien… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11560v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11560v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11560v1-abstract-full" style="display: none;"> The rise of foundation models has shifted focus from resource-intensive fine-tuning to prompt engineering, a paradigm that steers model behavior through input design rather than weight updates. While manual prompt engineering faces limitations in scalability, adaptability, and cross-modal alignment, automated methods, spanning foundation model (FM) based optimization, evolutionary methods, gradient-based optimization, and reinforcement learning, offer promising solutions. Existing surveys, however, remain fragmented across modalities and methodologies. This paper presents the first comprehensive survey on automated prompt engineering through a unified optimization-theoretic lens. We formalize prompt optimization as a maximization problem over discrete, continuous, and hybrid prompt spaces, systematically organizing methods by their optimization variables (instructions, soft prompts, exemplars), task-specific objectives, and computational frameworks. By bridging theoretical formulation with practical implementations across text, vision, and multimodal domains, this survey establishes a foundational framework for both researchers and practitioners, while highlighting underexplored frontiers in constrained optimization and agent-oriented prompt design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11560v1-abstract-full').style.display = 'none'; document.getElementById('2502.11560v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11518">arXiv:2502.11518</a> <span> [<a href="https://arxiv.org/pdf/2502.11518">pdf</a>, <a href="https://arxiv.org/format/2502.11518">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative Multi-Agent Collaboration in Embodied AI: A Systematic Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xian Wei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guang Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hao Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11518v1-abstract-short" style="display: inline;"> Embodied multi-agent systems (EMAS) have attracted growing attention for their potential to address complex, real-world challenges in areas such as logistics and robotics. Recent advances in foundation models pave the way for generative agents capable of richer communication and adaptive problem-solving. This survey provides a systematic examination of how EMAS can benefit from these generative ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11518v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11518v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11518v1-abstract-full" style="display: none;"> Embodied multi-agent systems (EMAS) have attracted growing attention for their potential to address complex, real-world challenges in areas such as logistics and robotics. Recent advances in foundation models pave the way for generative agents capable of richer communication and adaptive problem-solving. This survey provides a systematic examination of how EMAS can benefit from these generative capabilities. We propose a taxonomy that categorizes EMAS by system architectures and embodiment modalities, emphasizing how collaboration spans both physical and virtual contexts. Central building blocks, perception, planning, communication, and feedback, are then analyzed to illustrate how generative techniques bolster system robustness and flexibility. Through concrete examples, we demonstrate the transformative effects of integrating foundation models into embodied, multi-agent frameworks. Finally, we discuss challenges and future directions, underlining the significant promise of EMAS to reshape the landscape of AI-driven collaboration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11518v1-abstract-full').style.display = 'none'; document.getElementById('2502.11518v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11501">arXiv:2502.11501</a> <span> [<a href="https://arxiv.org/pdf/2502.11501">pdf</a>, <a href="https://arxiv.org/format/2502.11501">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Token Pruning in Multimodal Large Language Models: Are We Solving the Right Problem? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+Z">Zichen Wen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifeng Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijia Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Linfeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11501v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have shown remarkable performance for cross-modal understanding and generation, yet still suffer from severe inference costs. Recently, abundant works have been proposed to solve this problem with token pruning, which identifies the redundant tokens in MLLMs and then prunes them to reduce the computation and KV storage costs, leading to significant accelera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11501v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11501v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11501v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have shown remarkable performance for cross-modal understanding and generation, yet still suffer from severe inference costs. Recently, abundant works have been proposed to solve this problem with token pruning, which identifies the redundant tokens in MLLMs and then prunes them to reduce the computation and KV storage costs, leading to significant acceleration without training. While these methods claim efficiency gains, critical questions about their fundamental design and evaluation remain unanswered: Why do many existing approaches underperform even compared to naive random token selection? Are attention-based scoring sufficient for reliably identifying redundant tokens? Is language information really helpful during token pruning? What makes a good trade-off between token importance and duplication? Are current evaluation protocols comprehensive and unbiased? The ignorance of previous research on these problems hinders the long-term development of token pruning. In this paper, we answer these questions one by one, providing insights into the design of future token pruning methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11501v1-abstract-full').style.display = 'none'; document.getElementById('2502.11501v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11494">arXiv:2502.11494</a> <span> [<a href="https://arxiv.org/pdf/2502.11494">pdf</a>, <a href="https://arxiv.org/format/2502.11494">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stop Looking for Important Tokens in Multimodal Language Models: Duplication Matters More </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+Z">Zichen Wen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifeng Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shaobo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qintong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijia Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Linfeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11494v1-abstract-short" style="display: inline;"> Vision tokens in multimodal large language models often dominate huge computational overhead due to their excessive length compared to linguistic modality. Abundant recent methods aim to solve this problem with token pruning, which first defines an importance criterion for tokens and then prunes the unimportant vision tokens during inference. However, in this paper, we show that the importance is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11494v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11494v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11494v1-abstract-full" style="display: none;"> Vision tokens in multimodal large language models often dominate huge computational overhead due to their excessive length compared to linguistic modality. Abundant recent methods aim to solve this problem with token pruning, which first defines an importance criterion for tokens and then prunes the unimportant vision tokens during inference. However, in this paper, we show that the importance is not an ideal indicator to decide whether a token should be pruned. Surprisingly, it usually results in inferior performance than random token pruning and leading to incompatibility to efficient attention computation operators.Instead, we propose DART (Duplication-Aware Reduction of Tokens), which prunes tokens based on its duplication with other tokens, leading to significant and training-free acceleration. Concretely, DART selects a small subset of pivot tokens and then retains the tokens with low duplication to the pivots, ensuring minimal information loss during token pruning. Experiments demonstrate that DART can prune 88.9% vision tokens while maintaining comparable performance, leading to a 1.99$\times$ and 2.99$\times$ speed-up in total time and prefilling stage, respectively, with good compatibility to efficient attention operators. Our codes are available at https://github.com/ZichenWen1/DART. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11494v1-abstract-full').style.display = 'none'; document.getElementById('2502.11494v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11471">arXiv:2502.11471</a> <span> [<a href="https://arxiv.org/pdf/2502.11471">pdf</a>, <a href="https://arxiv.org/format/2502.11471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> GLTW: Joint Improved Graph Transformer and LLM via Three-Word Language for Knowledge Graph Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+K">Kangyang Luo</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yuzhuo Bai</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Cheng Gao</a>, <a href="/search/cs?searchtype=author&query=Si%2C+S">Shuzheng Si</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yingli Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhitong Wang</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Cunliang Kong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yufei Huang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Ye Tian</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+X">Xuantang Xiong</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Lei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11471v1-abstract-short" style="display: inline;"> Knowledge Graph Completion (KGC), which aims to infer missing or incomplete facts, is a crucial task for KGs. However, integrating the vital structural information of KGs into Large Language Models (LLMs) and outputting predictions deterministically remains challenging. To address this, we propose a new method called GLTW, which encodes the structural information of KGs and merges it with LLMs to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11471v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11471v1-abstract-full" style="display: none;"> Knowledge Graph Completion (KGC), which aims to infer missing or incomplete facts, is a crucial task for KGs. However, integrating the vital structural information of KGs into Large Language Models (LLMs) and outputting predictions deterministically remains challenging. To address this, we propose a new method called GLTW, which encodes the structural information of KGs and merges it with LLMs to enhance KGC performance. Specifically, we introduce an improved Graph Transformer (iGT) that effectively encodes subgraphs with both local and global structural information and inherits the characteristics of language model, bypassing training from scratch. Also, we develop a subgraph-based multi-classification training objective, using all entities within KG as classification objects, to boost learning efficiency.Importantly, we combine iGT with an LLM that takes KG language prompts as input.Our extensive experiments on various KG datasets show that GLTW achieves significant performance gains compared to SOTA baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11471v1-abstract-full').style.display = 'none'; document.getElementById('2502.11471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10705">arXiv:2502.10705</a> <span> [<a href="https://arxiv.org/pdf/2502.10705">pdf</a>, <a href="https://arxiv.org/format/2502.10705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoPEFT: Fast Adaptation Framework for Multi-Agent Collaborative Perception with Parameter-Efficient Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+Q">Quanmin Wei</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+P">Penglin Dai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingyi Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiao Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10705v1-abstract-short" style="display: inline;"> Multi-agent collaborative perception is expected to significantly improve perception performance by overcoming the limitations of single-agent perception through exchanging complementary information. However, training a robust collaborative perception model requires collecting sufficient training data that covers all possible collaboration scenarios, which is impractical due to intolerable deploym… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10705v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10705v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10705v1-abstract-full" style="display: none;"> Multi-agent collaborative perception is expected to significantly improve perception performance by overcoming the limitations of single-agent perception through exchanging complementary information. However, training a robust collaborative perception model requires collecting sufficient training data that covers all possible collaboration scenarios, which is impractical due to intolerable deployment costs. Hence, the trained model is not robust against new traffic scenarios with inconsistent data distribution and fundamentally restricts its real-world applicability. Further, existing methods, such as domain adaptation, have mitigated this issue by exposing the deployment data during the training stage but incur a high training cost, which is infeasible for resource-constrained agents. In this paper, we propose a Parameter-Efficient Fine-Tuning-based lightweight framework, CoPEFT, for fast adapting a trained collaborative perception model to new deployment environments under low-cost conditions. CoPEFT develops a Collaboration Adapter and Agent Prompt to perform macro-level and micro-level adaptations separately. Specifically, the Collaboration Adapter utilizes the inherent knowledge from training data and limited deployment data to adapt the feature map to new data distribution. The Agent Prompt further enhances the Collaboration Adapter by inserting fine-grained contextual information about the environment. Extensive experiments demonstrate that our CoPEFT surpasses existing methods with less than 1\% trainable parameters, proving the effectiveness and efficiency of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10705v1-abstract-full').style.display = 'none'; document.getElementById('2502.10705v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 39th AAAI Conference on Artificial Intelligence (AAAI-25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09873">arXiv:2502.09873</a> <span> [<a href="https://arxiv.org/pdf/2502.09873">pdf</a>, <a href="https://arxiv.org/format/2502.09873">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Compression-Aware One-Step Diffusion Model for JPEG Artifact Removal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jinpei Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zheng Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09873v2-abstract-short" style="display: inline;"> Diffusion models have demonstrated remarkable success in image restoration tasks. However, their multi-step denoising process introduces significant computational overhead, limiting their practical deployment. Furthermore, existing methods struggle to effectively remove severe JPEG artifact, especially in highly compressed images. To address these challenges, we propose CODiff, a compression-aware… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09873v2-abstract-full').style.display = 'inline'; document.getElementById('2502.09873v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09873v2-abstract-full" style="display: none;"> Diffusion models have demonstrated remarkable success in image restoration tasks. However, their multi-step denoising process introduces significant computational overhead, limiting their practical deployment. Furthermore, existing methods struggle to effectively remove severe JPEG artifact, especially in highly compressed images. To address these challenges, we propose CODiff, a compression-aware one-step diffusion model for JPEG artifact removal. The core of CODiff is the compression-aware visual embedder (CaVE), which extracts and leverages JPEG compression priors to guide the diffusion model. We propose a dual learning strategy that combines explicit and implicit learning. Specifically, explicit learning enforces a quality prediction objective to differentiate low-quality images with different compression levels. Implicit learning employs a reconstruction objective that enhances the model's generalization. This dual learning allows for a deeper and more comprehensive understanding of JPEG compression. Experimental results demonstrate that CODiff surpasses recent leading methods in both quantitative and visual quality metrics. The code and models will be released at https://github.com/jp-guo/CODiff. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09873v2-abstract-full').style.display = 'none'; document.getElementById('2502.09873v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09869">arXiv:2502.09869</a> <span> [<a href="https://arxiv.org/pdf/2502.09869">pdf</a>, <a href="https://arxiv.org/format/2502.09869">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Beyond Explicit and Implicit: How Users Provide Feedback to Shape Personalized Recommendation Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenqi Li</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+J">Jui-Ching Kuo</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+M">Manyu Sheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pengyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qunfang Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09869v1-abstract-short" style="display: inline;"> As personalized recommendation algorithms become integral to social media platforms, users are increasingly aware of their ability to influence recommendation content. However, limited research has explored how users provide feedback through their behaviors and platform mechanisms to shape the recommendation content. We conducted semi-structured interviews with 34 active users of algorithmic-drive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09869v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09869v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09869v1-abstract-full" style="display: none;"> As personalized recommendation algorithms become integral to social media platforms, users are increasingly aware of their ability to influence recommendation content. However, limited research has explored how users provide feedback through their behaviors and platform mechanisms to shape the recommendation content. We conducted semi-structured interviews with 34 active users of algorithmic-driven social media platforms (e.g., Xiaohongshu, Douyin). In addition to explicit and implicit feedback, this study introduced intentional implicit feedback, highlighting the actions users intentionally took to refine recommendation content through perceived feedback mechanisms. Additionally, choices of feedback behaviors were found to align with specific purposes. Explicit feedback was primarily used for feed customization, while unintentional implicit feedback was more linked to content consumption. Intentional implicit feedback was employed for multiple purposes, particularly in increasing content diversity and improving recommendation relevance. This work underscores the user intention dimension in the explicit-implicit feedback dichotomy and offers insights for designing personalized recommendation feedback that better responds to users' needs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09869v1-abstract-full').style.display = 'none'; document.getElementById('2502.09869v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The final version is available at https://doi.org/10.1145/3706598.3713241</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09762">arXiv:2502.09762</a> <span> [<a href="https://arxiv.org/pdf/2502.09762">pdf</a>, <a href="https://arxiv.org/format/2502.09762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Teaming in Multi-Drone Pursuit: Simulation, Training, and Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junfan Chen</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+F">Feng Xue</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+J">Jiabin Qiu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Ying Wen</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+W">Wei Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09762v1-abstract-short" style="display: inline;"> Adaptive teaming, the ability to collaborate with unseen teammates without prior coordination, remains an underexplored challenge in multi-robot collaboration. This paper focuses on adaptive teaming in multi-drone cooperative pursuit, a critical task with real-world applications such as border surveillance, search-and-rescue, and counter-terrorism. We first define and formalize the \textbf{A}dapti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09762v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09762v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09762v1-abstract-full" style="display: none;"> Adaptive teaming, the ability to collaborate with unseen teammates without prior coordination, remains an underexplored challenge in multi-robot collaboration. This paper focuses on adaptive teaming in multi-drone cooperative pursuit, a critical task with real-world applications such as border surveillance, search-and-rescue, and counter-terrorism. We first define and formalize the \textbf{A}daptive Teaming in \textbf{M}ulti-\textbf{D}rone \textbf{P}ursuit (AT-MDP) problem and introduce AT-MDP framework, a comprehensive framework that integrates simulation, algorithm training and real-world deployment. AT-MDP framework provides a flexible experiment configurator and interface for simulation, a distributed training framework with an extensive algorithm zoo (including two newly proposed baseline methods) and an unseen drone zoo for evaluating adaptive teaming, as well as a real-world deployment system that utilizes edge computing and Crazyflie drones. To the best of our knowledge, AT-MDP framework is the first adaptive framework for continuous-action decision-making in complex real-world drone tasks, enabling multiple drones to coordinate effectively with unseen teammates. Extensive experiments in four multi-drone pursuit environments of increasing difficulty confirm the effectiveness of AT-MDP framework, while real-world deployments further validate its feasibility in physical systems. Videos and code are available at https://sites.google.com/view/at-mdp. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09762v1-abstract-full').style.display = 'none'; document.getElementById('2502.09762v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09412">arXiv:2502.09412</a> <span>  </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> </div> </div> <p class="title is-5 mathjax"> A LP-rounding based algorithm for soft capacitated facility location problem with submodular penalties </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+H">Hanyin Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhikang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weidong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09412v2-abstract-short" style="display: inline;"> The soft capacitated facility location problem (SCFLP) is a classic combinatorial optimization problem, with its variants widely applied in the fields of operations research and computer science. In the SCFLP, given a set $\mathcal{F}$ of facilities and a set $\mathcal{D}$ of clients, each facility has a capacity and an open cost, allowing to open multiple times, and each client has a demand. Th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09412v2-abstract-full').style.display = 'inline'; document.getElementById('2502.09412v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09412v2-abstract-full" style="display: none;"> The soft capacitated facility location problem (SCFLP) is a classic combinatorial optimization problem, with its variants widely applied in the fields of operations research and computer science. In the SCFLP, given a set $\mathcal{F}$ of facilities and a set $\mathcal{D}$ of clients, each facility has a capacity and an open cost, allowing to open multiple times, and each client has a demand. This problem is to find a subset of facilities in $\mathcal{F}$ and connect each client to the facilities opened, such that the total cost including open cost and connection cost is minimied. SCFLP is a NP-hard problem, which has led to a focus on approximation algorithms. Based on this, we consider a variant, that is, soft capacitated facility location problem with submodular penalties (SCFLPSP), which allows some clients not to be served by accepting the penalty cost. And we consider the integer splittable case of demand, that is, the demand of each client is served by multiple facilities with the integer service amount by each facility. Based on LP-rounding, we propose a $(位R+4)$-approximation algorithm, where $R=\frac{\max_{i \in \mathcal{F} }f_i}{\min_{i \in \mathcal{F} }f_i},位=\frac{R+\sqrt{R^2+8R}}{2R}$. In particular, when the open cost is uniform, the approximation ratio is 6. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09412v2-abstract-full').style.display = 'none'; document.getElementById('2502.09412v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We have not yet fully reconciled the paper and need to update parts of it</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09014">arXiv:2502.09014</a> <span> [<a href="https://arxiv.org/pdf/2502.09014">pdf</a>, <a href="https://arxiv.org/format/2502.09014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Optimal Contest Design with Entry Restriction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hanbing Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Ningyuan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weian Li</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Changyuan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09014v1-abstract-short" style="display: inline;"> This paper explores the design of contests involving $n$ contestants, focusing on how the designer decides on the number of contestants allowed and the prize structure with a fixed budget. We characterize the unique symmetric Bayesian Nash equilibrium of contestants and find the optimal contests design for the maximum individual effort objective and the total effort objective. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09014v1-abstract-full" style="display: none;"> This paper explores the design of contests involving $n$ contestants, focusing on how the designer decides on the number of contestants allowed and the prize structure with a fixed budget. We characterize the unique symmetric Bayesian Nash equilibrium of contestants and find the optimal contests design for the maximum individual effort objective and the total effort objective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09014v1-abstract-full').style.display = 'none'; document.getElementById('2502.09014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08969">arXiv:2502.08969</a> <span> [<a href="https://arxiv.org/pdf/2502.08969">pdf</a>, <a href="https://arxiv.org/format/2502.08969">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> SkyRover: A Modular Simulator for Cross-Domain Pathfinding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wenhui Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Changhong Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08969v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicles (UAVs) and Automated Guided Vehicles (AGVs) increasingly collaborate in logistics, surveillance, inspection tasks and etc. However, existing simulators often focus on a single domain, limiting cross-domain study. This paper presents the SkyRover, a modular simulator for UAV-AGV multi-agent pathfinding (MAPF). SkyRover supports realistic agent dynamics, configurable 3D envi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08969v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08969v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08969v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicles (UAVs) and Automated Guided Vehicles (AGVs) increasingly collaborate in logistics, surveillance, inspection tasks and etc. However, existing simulators often focus on a single domain, limiting cross-domain study. This paper presents the SkyRover, a modular simulator for UAV-AGV multi-agent pathfinding (MAPF). SkyRover supports realistic agent dynamics, configurable 3D environments, and convenient APIs for external solvers and learning methods. By unifying ground and aerial operations, it facilitates cross-domain algorithm design, testing, and benchmarking. Experiments highlight SkyRover's capacity for efficient pathfinding and high-fidelity simulations in UAV-AGV coordination. Project is available at https://sites.google.com/view/mapf3d/home. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08969v1-abstract-full').style.display = 'none'; document.getElementById('2502.08969v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08507">arXiv:2502.08507</a> <span> [<a href="https://arxiv.org/pdf/2502.08507">pdf</a>, <a href="https://arxiv.org/format/2502.08507">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Explanation based In-Context Demonstrations Retrieval for Multilingual Grammatical Error Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Wen Luo</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+G">Guangyue Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Houfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08507v1-abstract-short" style="display: inline;"> Grammatical error correction (GEC) aims to correct grammatical, spelling, and semantic errors in natural language text. With the growing of large language models (LLMs), direct text generation has gradually become the focus of the GEC methods, and few-shot in-context learning presents a cost-effective solution. However, selecting effective in-context examples remains challenging, as the similarity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08507v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08507v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08507v1-abstract-full" style="display: none;"> Grammatical error correction (GEC) aims to correct grammatical, spelling, and semantic errors in natural language text. With the growing of large language models (LLMs), direct text generation has gradually become the focus of the GEC methods, and few-shot in-context learning presents a cost-effective solution. However, selecting effective in-context examples remains challenging, as the similarity between input texts does not necessarily correspond to similar grammatical error patterns. In this paper, we propose a novel retrieval method based on natural language grammatical error explanations (GEE) to address this issue. Our method retrieves suitable few-shot demonstrations by matching the GEE of the test input with that of pre-constructed database samples, where explanations for erroneous samples are generated by LLMs. We conducted multilingual GEC few-shot experiments on both major open-source and closed-source LLMs. Experiments across five languages show that our method outperforms existing semantic and BM25-based retrieval techniques, without requiring additional training or language adaptation. This also suggests that matching error patterns is key to selecting examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08507v1-abstract-full').style.display = 'none'; document.getElementById('2502.08507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2025 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07288">arXiv:2502.07288</a> <span> [<a href="https://arxiv.org/pdf/2502.07288">pdf</a>, <a href="https://arxiv.org/format/2502.07288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> KPIs 2024 Challenge: Advancing Glomerular Segmentation from Patch- to Slide-Level </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+R">Ruining Deng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Junlin Guo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Siqi Lu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Juming Xiong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lining Yu</a>, <a href="/search/cs?searchtype=author&query=Cap%2C+Q+H">Quan Huu Cap</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+P">Pengzhou Cai</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+L">Libin Lan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ze Zhao</a>, <a href="/search/cs?searchtype=author&query=Galdran%2C+A">Adrian Galdran</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Amit Kumar</a>, <a href="/search/cs?searchtype=author&query=Deotale%2C+G">Gunjan Deotale</a>, <a href="/search/cs?searchtype=author&query=Das%2C+D+K">Dev Kumar Das</a>, <a href="/search/cs?searchtype=author&query=Paik%2C+I">Inyoung Paik</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joonho Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Geongyu Lee</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yujia Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wangkai Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyang Li</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+X">Xuege Hou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zeyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shengjin Wang</a>, <a href="/search/cs?searchtype=author&query=Fischer%2C+M">Maximilian Fischer</a> , et al. (22 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07288v1-abstract-short" style="display: inline;"> Chronic kidney disease (CKD) is a major global health issue, affecting over 10% of the population and causing significant mortality. While kidney biopsy remains the gold standard for CKD diagnosis and treatment, the lack of comprehensive benchmarks for kidney pathology segmentation hinders progress in the field. To address this, we organized the Kidney Pathology Image Segmentation (KPIs) Challenge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07288v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07288v1-abstract-full" style="display: none;"> Chronic kidney disease (CKD) is a major global health issue, affecting over 10% of the population and causing significant mortality. While kidney biopsy remains the gold standard for CKD diagnosis and treatment, the lack of comprehensive benchmarks for kidney pathology segmentation hinders progress in the field. To address this, we organized the Kidney Pathology Image Segmentation (KPIs) Challenge, introducing a dataset that incorporates preclinical rodent models of CKD with over 10,000 annotated glomeruli from 60+ Periodic Acid Schiff (PAS)-stained whole slide images. The challenge includes two tasks, patch-level segmentation and whole slide image segmentation and detection, evaluated using the Dice Similarity Coefficient (DSC) and F1-score. By encouraging innovative segmentation methods that adapt to diverse CKD models and tissue conditions, the KPIs Challenge aims to advance kidney pathology analysis, establish new benchmarks, and enable precise, large-scale quantification for disease research and diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07288v1-abstract-full').style.display = 'none'; document.getElementById('2502.07288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07216">arXiv:2502.07216</a> <span> [<a href="https://arxiv.org/pdf/2502.07216">pdf</a>, <a href="https://arxiv.org/format/2502.07216">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681043">10.1145/3664647.3681043 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SparseFormer: Detecting Objects in HRW Shots via Sparse Vision Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenxi Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuchen Guo</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jilai Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haozhe Lin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chao Ma</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Lu Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07216v1-abstract-short" style="display: inline;"> Recent years have seen an increase in the use of gigapixel-level image and video capture systems and benchmarks with high-resolution wide (HRW) shots. However, unlike close-up shots in the MS COCO dataset, the higher resolution and wider field of view raise unique challenges, such as extreme sparsity and huge scale changes, causing existing close-up detectors inaccuracy and inefficiency. In this p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07216v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07216v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07216v1-abstract-full" style="display: none;"> Recent years have seen an increase in the use of gigapixel-level image and video capture systems and benchmarks with high-resolution wide (HRW) shots. However, unlike close-up shots in the MS COCO dataset, the higher resolution and wider field of view raise unique challenges, such as extreme sparsity and huge scale changes, causing existing close-up detectors inaccuracy and inefficiency. In this paper, we present a novel model-agnostic sparse vision transformer, dubbed SparseFormer, to bridge the gap of object detection between close-up and HRW shots. The proposed SparseFormer selectively uses attentive tokens to scrutinize the sparsely distributed windows that may contain objects. In this way, it can jointly explore global and local attention by fusing coarse- and fine-grained features to handle huge scale changes. SparseFormer also benefits from a novel Cross-slice non-maximum suppression (C-NMS) algorithm to precisely localize objects from noisy windows and a simple yet effective multi-scale strategy to improve accuracy. Extensive experiments on two HRW benchmarks, PANDA and DOTA-v1.0, demonstrate that the proposed SparseFormer significantly improves detection accuracy (up to 5.8%) and speed (up to 3x) over the state-of-the-art approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07216v1-abstract-full').style.display = 'none'; document.getElementById('2502.07216v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted to ACM MM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06453">arXiv:2502.06453</a> <span> [<a href="https://arxiv.org/pdf/2502.06453">pdf</a>, <a href="https://arxiv.org/format/2502.06453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MATH-Perturb: Benchmarking LLMs' Math Reasoning Abilities against Hard Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaixuan Huang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiacheng Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zihao Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiang Ji</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiawei Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenzhe Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yingqing Guo</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+T">Tianle Cai</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hui Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runzhe Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yue Wu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shange Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengdi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06453v2-abstract-short" style="display: inline;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06453v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06453v2-abstract-full" style="display: none;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underlying reasoning patterns of the solutions. However, no work has explored hard perturbations, which fundamentally change the nature of the problem so that the original solution steps do not apply. To bridge the gap, we construct MATH-P-Simple and MATH-P-Hard via simple perturbation and hard perturbation, respectively. Each consists of 279 perturbed math problems derived from level-5 (hardest) problems in the MATH dataset (Hendrycksmath et. al., 2021). We observe significant performance drops on MATH-P-Hard across various models, including o1-mini (-16.49%) and gemini-2.0-flash-thinking (-12.9%). We also raise concerns about a novel form of memorization where models blindly apply learned problem-solving skills without assessing their applicability to modified contexts. This issue is amplified when using original problems for in-context learning. We call for research efforts to address this challenge, which is critical for developing more robust and reliable reasoning models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'none'; document.getElementById('2502.06453v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v2: fix bugs in Fig. 1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06428">arXiv:2502.06428</a> <span> [<a href="https://arxiv.org/pdf/2502.06428">pdf</a>, <a href="https://arxiv.org/format/2502.06428">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoS: Chain-of-Shot Prompting for Long Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jian Hu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zixu Cheng</a>, <a href="/search/cs?searchtype=author&query=Si%2C+C">Chenyang Si</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+S">Shaogang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06428v2-abstract-short" style="display: inline;"> Multi-modal Large Language Models (MLLMs) struggle with long videos due to the need for excessive visual tokens. These tokens exceed massively the context length of MLLMs, resulting in filled by redundant task-irrelevant shots. How to select shots is an unsolved critical problem: sparse sampling risks missing key details, while exhaustive sampling overwhelms the model with irrelevant content, lead… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06428v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06428v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06428v2-abstract-full" style="display: none;"> Multi-modal Large Language Models (MLLMs) struggle with long videos due to the need for excessive visual tokens. These tokens exceed massively the context length of MLLMs, resulting in filled by redundant task-irrelevant shots. How to select shots is an unsolved critical problem: sparse sampling risks missing key details, while exhaustive sampling overwhelms the model with irrelevant content, leading to video misunderstanding. To solve this problem, we propose Chain-of-Shot prompting (CoS). The key idea is to frame shot selection as test-time visual prompt optimisation, choosing shots adaptive to video understanding semantic task by optimising shots-task alignment. CoS has two key parts: (1) a binary video summary mechanism that performs pseudo temporal grounding, discovering a binary coding to identify task-relevant shots, and (2) a video co-reasoning module that deploys the binary coding to pair (learning to align) task-relevant positive shots with irrelevant negative shots. It embeds the optimised shot selections into the original video, facilitating a focus on relevant context to optimize long video understanding. Experiments across three baselines and five datasets demonstrate the effectiveness and adaptability of CoS. Code given in https://lwpyh.github.io/CoS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06428v2-abstract-full').style.display = 'none'; document.getElementById('2502.06428v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A training-free test-time optimisation approach for long video understanding</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06164">arXiv:2502.06164</a> <span> [<a href="https://arxiv.org/pdf/2502.06164">pdf</a>, <a href="https://arxiv.org/format/2502.06164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Generalized Temporal Tensor Decomposition with Rank-revealing Latent-ODE </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+P">Panqi Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lei Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianlong Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weichang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiqing Liu</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+J">Jiang Bian</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+S">Shikai Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06164v1-abstract-short" style="display: inline;"> Tensor decomposition is a fundamental tool for analyzing multi-dimensional data by learning low-rank factors to represent high-order interactions. While recent works on temporal tensor decomposition have made significant progress by incorporating continuous timestamps in latent factors, they still struggle with general tensor data with continuous indexes not only in the temporal mode but also in o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06164v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06164v1-abstract-full" style="display: none;"> Tensor decomposition is a fundamental tool for analyzing multi-dimensional data by learning low-rank factors to represent high-order interactions. While recent works on temporal tensor decomposition have made significant progress by incorporating continuous timestamps in latent factors, they still struggle with general tensor data with continuous indexes not only in the temporal mode but also in other modes, such as spatial coordinates in climate data. Additionally, the problem of determining the tensor rank remains largely unexplored in temporal tensor models. To address these limitations, we propose \underline{G}eneralized temporal tensor decomposition with \underline{R}ank-r\underline{E}vealing laten\underline{T}-ODE (GRET). Our approach encodes continuous spatial indexes as learnable Fourier features and employs neural ODEs in latent space to learn the temporal trajectories of factors. To automatically reveal the rank of temporal tensors, we introduce a rank-revealing Gaussian-Gamma prior over the factor trajectories. We develop an efficient variational inference scheme with an analytical evidence lower bound, enabling sampling-free optimization. Through extensive experiments on both synthetic and real-world datasets, we demonstrate that GRET not only reveals the underlying ranks of temporal tensors but also significantly outperforms existing methods in prediction performance and robustness against noise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06164v1-abstract-full').style.display = 'none'; document.getElementById('2502.06164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05798">arXiv:2502.05798</a> <span> [<a href="https://arxiv.org/pdf/2502.05798">pdf</a>, <a href="https://arxiv.org/format/2502.05798">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> StreamDCIM: A Tile-based Streaming Digital CIM Accelerator with Mixed-stationary Cross-forwarding Dataflow for Multimodal Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+S">Shantian Qin</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+Z">Ziqing Qiang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhihua Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenming Li</a>, <a href="/search/cs?searchtype=author&query=An%2C+X">Xuejun An</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xiaochun Ye</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+D">Dongrui Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05798v1-abstract-short" style="display: inline;"> Multimodal Transformers are emerging artificial intelligence (AI) models designed to process a mixture of signals from diverse modalities. Digital computing-in-memory (CIM) architectures are considered promising for achieving high efficiency while maintaining high accuracy. However, current digital CIM-based accelerators exhibit inflexibility in microarchitecture, dataflow, and pipeline to effecti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05798v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05798v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05798v1-abstract-full" style="display: none;"> Multimodal Transformers are emerging artificial intelligence (AI) models designed to process a mixture of signals from diverse modalities. Digital computing-in-memory (CIM) architectures are considered promising for achieving high efficiency while maintaining high accuracy. However, current digital CIM-based accelerators exhibit inflexibility in microarchitecture, dataflow, and pipeline to effectively accelerate multimodal Transformer. In this paper, we propose StreamDCIM, a tile-based streaming digital CIM accelerator for multimodal Transformers. It overcomes the above challenges with three features: First, we present a tile-based reconfigurable CIM macro microarchitecture with normal and hybrid reconfigurable modes to improve intra-macro CIM utilization. Second, we implement a mixed-stationary cross-forwarding dataflow with tile-based execution decoupling to exploit tile-level computation parallelism. Third, we introduce a ping-pong-like fine-grained compute-rewriting pipeline to overlap high-latency on-chip CIM rewriting. Experimental results show that StreamDCIM outperforms non-streaming and layer-based streaming CIM-based solutions by geomean 2.63$\times$ and 1.28$\times$ on typical multimodal Transformer models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05798v1-abstract-full').style.display = 'none'; document.getElementById('2502.05798v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2025 IEEE International Symposium on Circuits and Systems (ISCAS)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05726">arXiv:2502.05726</a> <span> [<a href="https://arxiv.org/pdf/2502.05726">pdf</a>, <a href="https://arxiv.org/format/2502.05726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Improving Environment Novelty Quantification for Effective Unsupervised Environment Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Teoh%2C+J">Jayden Teoh</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjun Li</a>, <a href="/search/cs?searchtype=author&query=Varakantham%2C+P">Pradeep Varakantham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05726v1-abstract-short" style="display: inline;"> Unsupervised Environment Design (UED) formalizes the problem of autocurricula through interactive training between a teacher agent and a student agent. The teacher generates new training environments with high learning potential, curating an adaptive curriculum that strengthens the student's ability to handle unseen scenarios. Existing UED methods mainly rely on regret, a metric that measures the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05726v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05726v1-abstract-full" style="display: none;"> Unsupervised Environment Design (UED) formalizes the problem of autocurricula through interactive training between a teacher agent and a student agent. The teacher generates new training environments with high learning potential, curating an adaptive curriculum that strengthens the student's ability to handle unseen scenarios. Existing UED methods mainly rely on regret, a metric that measures the difference between the agent's optimal and actual performance, to guide curriculum design. Regret-driven methods generate curricula that progressively increase environment complexity for the student but overlook environment novelty -- a critical element for enhancing an agent's generalizability. Measuring environment novelty is especially challenging due to the underspecified nature of environment parameters in UED, and existing approaches face significant limitations. To address this, this paper introduces the Coverage-based Evaluation of Novelty In Environment (CENIE) framework. CENIE proposes a scalable, domain-agnostic, and curriculum-aware approach to quantifying environment novelty by leveraging the student's state-action space coverage from previous curriculum experiences. We then propose an implementation of CENIE that models this coverage and measures environment novelty using Gaussian Mixture Models. By integrating both regret and novelty as complementary objectives for curriculum design, CENIE facilitates effective exploration across the state-action space while progressively increasing curriculum complexity. Empirical evaluations demonstrate that augmenting existing regret-based UED algorithms with CENIE achieves state-of-the-art performance across multiple benchmarks, underscoring the effectiveness of novelty-driven autocurricula for robust generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05726v1-abstract-full').style.display = 'none'; document.getElementById('2502.05726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05629">arXiv:2502.05629</a> <span> [<a href="https://arxiv.org/pdf/2502.05629">pdf</a>, <a href="https://arxiv.org/format/2502.05629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> TrackDiffuser: Nearly Model-Free Bayesian Filtering with Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangguang He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Minzhe Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Juan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05629v1-abstract-short" style="display: inline;"> State estimation remains a fundamental challenge across numerous domains, from autonomous driving, aircraft tracking to quantum system control. Although Bayesian filtering has been the cornerstone solution, its classical model-based paradigm faces two major limitations: it struggles with inaccurate state space model (SSM) and requires extensive prior knowledge of noise characteristics. We present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05629v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05629v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05629v1-abstract-full" style="display: none;"> State estimation remains a fundamental challenge across numerous domains, from autonomous driving, aircraft tracking to quantum system control. Although Bayesian filtering has been the cornerstone solution, its classical model-based paradigm faces two major limitations: it struggles with inaccurate state space model (SSM) and requires extensive prior knowledge of noise characteristics. We present TrackDiffuser, a generative framework addressing both challenges by reformulating Bayesian filtering as a conditional diffusion model. Our approach implicitly learns system dynamics from data to mitigate the effects of inaccurate SSM, while simultaneously circumventing the need for explicit measurement models and noise priors by establishing a direct relationship between measurements and states. Through an implicit predict-and-update mechanism, TrackDiffuser preserves the interpretability advantage of traditional model-based filtering methods. Extensive experiments demonstrate that our framework substantially outperforms both classical and contemporary hybrid methods, especially in challenging non-linear scenarios involving non-Gaussian noises. Notably, TrackDiffuser exhibits remarkable robustness to SSM inaccuracies, offering a practical solution for real-world state estimation problems where perfect models and prior knowledge are unavailable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05629v1-abstract-full').style.display = 'none'; document.getElementById('2502.05629v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05612">arXiv:2502.05612</a> <span> [<a href="https://arxiv.org/pdf/2502.05612">pdf</a>, <a href="https://arxiv.org/format/2502.05612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Rambler in the Wild: A Diary Study of LLM-Assisted Writing With Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wengxi Li</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M+G">Matthew G. Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoyang Li</a>, <a href="/search/cs?searchtype=author&query=Zamfirescu-Pereira%2C+J+D">J. D. Zamfirescu-Pereira</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Can Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05612v1-abstract-short" style="display: inline;"> Speech-to-text technologies have been shown to improve text input efficiency and potentially lower the barriers to writing. Recent LLM-assisted dictation tools aim to support writing with speech by bridging the gaps between speaking and traditional writing. This case study reports on the real-world writing experiences of twelve academic or creative writers using one such tool, Rambler, to write va… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05612v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05612v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05612v1-abstract-full" style="display: none;"> Speech-to-text technologies have been shown to improve text input efficiency and potentially lower the barriers to writing. Recent LLM-assisted dictation tools aim to support writing with speech by bridging the gaps between speaking and traditional writing. This case study reports on the real-world writing experiences of twelve academic or creative writers using one such tool, Rambler, to write various pieces such as blog posts, diaries, screenplays, notes, or fictional stories, etc. Through a ten-day diary study, we identified the participants' in-context writing strategies using Rambler, such as how they expanded from an outline or organized their loose thoughts for different writing goals. The interviews uncovered the psychological and productivity affordances of writing with speech, pointing to future directions of designing for this writing modality and the utilization of AI support. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05612v1-abstract-full').style.display = 'none'; document.getElementById('2502.05612v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05534">arXiv:2502.05534</a> <span> [<a href="https://arxiv.org/pdf/2502.05534">pdf</a>, <a href="https://arxiv.org/format/2502.05534">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fg-T2M++: LLMs-Augmented Fine-Grained Text Driven Human Motion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mu Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiapeng Liu</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+Z">Zhiying Leng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F+W+B">Frederick W. B. Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziyao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaohui Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05534v1-abstract-short" style="display: inline;"> We address the challenging problem of fine-grained text-driven human motion generation. Existing works generate imprecise motions that fail to accurately capture relationships specified in text due to: (1) lack of effective text parsing for detailed semantic cues regarding body parts, (2) not fully modeling linguistic structures between words to comprehend text comprehensively. To tackle these lim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05534v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05534v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05534v1-abstract-full" style="display: none;"> We address the challenging problem of fine-grained text-driven human motion generation. Existing works generate imprecise motions that fail to accurately capture relationships specified in text due to: (1) lack of effective text parsing for detailed semantic cues regarding body parts, (2) not fully modeling linguistic structures between words to comprehend text comprehensively. To tackle these limitations, we propose a novel fine-grained framework Fg-T2M++ that consists of: (1) an LLMs semantic parsing module to extract body part descriptions and semantics from text, (2) a hyperbolic text representation module to encode relational information between text units by embedding the syntactic dependency graph into hyperbolic space, and (3) a multi-modal fusion module to hierarchically fuse text and motion features. Extensive experiments on HumanML3D and KIT-ML datasets demonstrate that Fg-T2M++ outperforms SOTA methods, validating its ability to accurately generate motions adhering to comprehensive text semantics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05534v1-abstract-full').style.display = 'none'; document.getElementById('2502.05534v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05264">arXiv:2502.05264</a> <span> [<a href="https://arxiv.org/pdf/2502.05264">pdf</a>, <a href="https://arxiv.org/format/2502.05264">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Quantum automated learning with provable and explainable trainability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qi Ye</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+S">Shuangyue Geng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zizhao Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weikang Li</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+L+-">L. -M. Duan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+D">Dong-Ling Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05264v1-abstract-short" style="display: inline;"> Machine learning is widely believed to be one of the most promising practical applications of quantum computing. Existing quantum machine learning schemes typically employ a quantum-classical hybrid approach that relies crucially on gradients of model parameters. Such an approach lacks provable convergence to global minima and will become infeasible as quantum learning models scale up. Here, we in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05264v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05264v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05264v1-abstract-full" style="display: none;"> Machine learning is widely believed to be one of the most promising practical applications of quantum computing. Existing quantum machine learning schemes typically employ a quantum-classical hybrid approach that relies crucially on gradients of model parameters. Such an approach lacks provable convergence to global minima and will become infeasible as quantum learning models scale up. Here, we introduce quantum automated learning, where no variational parameter is involved and the training process is converted to quantum state preparation. In particular, we encode training data into unitary operations and iteratively evolve a random initial state under these unitaries and their inverses, with a target-oriented perturbation towards higher prediction accuracy sandwiched in between. Under reasonable assumptions, we rigorously prove that the evolution converges exponentially to the desired state corresponding to the global minimum of the loss function. We show that such a training process can be understood from the perspective of preparing quantum states by imaginary time evolution, where the data-encoded unitaries together with target-oriented perturbations would train the quantum learning model in an automated fashion. We further prove that the quantum automated learning paradigm features good generalization ability with the generalization error upper bounded by the ratio between a logarithmic function of the Hilbert space dimension and the number of training samples. In addition, we carry out extensive numerical simulations on real-life images and quantum data to demonstrate the effectiveness of our approach and validate the assumptions. Our results establish an unconventional quantum learning strategy that is gradient-free with provable and explainable trainability, which would be crucial for large-scale practical applications of quantum computing in machine learning scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05264v1-abstract-full').style.display = 'none'; document.getElementById('2502.05264v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05179">arXiv:2502.05179</a> <span> [<a href="https://arxiv.org/pdf/2502.05179">pdf</a>, <a href="https://arxiv.org/format/2502.05179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FlashVideo:Flowing Fidelity to Detail for Efficient High-Resolution Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shilong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shoufa Chen</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+C">Chongjian Ge</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+P">Peize Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yida Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yi Jiang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zehuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Binyue Peng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05179v1-abstract-short" style="display: inline;"> DiT diffusion models have achieved great success in text-to-video generation, leveraging their scalability in model capacity and data scale. High content and motion fidelity aligned with text prompts, however, often require large model parameters and a substantial number of function evaluations (NFEs). Realistic and visually appealing details are typically reflected in high resolution outputs, fur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05179v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05179v1-abstract-full" style="display: none;"> DiT diffusion models have achieved great success in text-to-video generation, leveraging their scalability in model capacity and data scale. High content and motion fidelity aligned with text prompts, however, often require large model parameters and a substantial number of function evaluations (NFEs). Realistic and visually appealing details are typically reflected in high resolution outputs, further amplifying computational demands especially for single stage DiT models. To address these challenges, we propose a novel two stage framework, FlashVideo, which strategically allocates model capacity and NFEs across stages to balance generation fidelity and quality. In the first stage, prompt fidelity is prioritized through a low resolution generation process utilizing large parameters and sufficient NFEs to enhance computational efficiency. The second stage establishes flow matching between low and high resolutions, effectively generating fine details with minimal NFEs. Quantitative and visual results demonstrate that FlashVideo achieves state-of-the-art high resolution video generation with superior computational efficiency. Additionally, the two-stage design enables users to preview the initial output before committing to full resolution generation, thereby significantly reducing computational costs and wait times as well as enhancing commercial viability . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05179v1-abstract-full').style.display = 'none'; document.getElementById('2502.05179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Model and Weight: https://github.com/FoundationVision/FlashVideo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04377">arXiv:2502.04377</a> <span> [<a href="https://arxiv.org/pdf/2502.04377">pdf</a>, <a href="https://arxiv.org/format/2502.04377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MapFusion: A Novel BEV Feature Fusion Network for Multi-modal Map Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiaoshuai Hao</a>, <a href="/search/cs?searchtype=author&query=Diao%2C+Y">Yunfeng Diao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+M">Mengchuan Wei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+P">Peng Hao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+R">Rong Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiming Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shu Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04377v1-abstract-short" style="display: inline;"> Map construction task plays a vital role in providing precise and comprehensive static environmental information essential for autonomous driving systems. Primary sensors include cameras and LiDAR, with configurations varying between camera-only, LiDAR-only, or camera-LiDAR fusion, based on cost-performance considerations. While fusion-based methods typically perform best, existing approaches ofte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04377v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04377v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04377v1-abstract-full" style="display: none;"> Map construction task plays a vital role in providing precise and comprehensive static environmental information essential for autonomous driving systems. Primary sensors include cameras and LiDAR, with configurations varying between camera-only, LiDAR-only, or camera-LiDAR fusion, based on cost-performance considerations. While fusion-based methods typically perform best, existing approaches often neglect modality interaction and rely on simple fusion strategies, which suffer from the problems of misalignment and information loss. To address these issues, we propose MapFusion, a novel multi-modal Bird's-Eye View (BEV) feature fusion method for map construction. Specifically, to solve the semantic misalignment problem between camera and LiDAR BEV features, we introduce the Cross-modal Interaction Transform (CIT) module, enabling interaction between two BEV feature spaces and enhancing feature representation through a self-attention mechanism. Additionally, we propose an effective Dual Dynamic Fusion (DDF) module to adaptively select valuable information from different modalities, which can take full advantage of the inherent information between different modalities. Moreover, MapFusion is designed to be simple and plug-and-play, easily integrated into existing pipelines. We evaluate MapFusion on two map construction tasks, including High-definition (HD) map and BEV map segmentation, to show its versatility and effectiveness. Compared with the state-of-the-art methods, MapFusion achieves 3.6% and 6.2% absolute improvements on the HD map construction and BEV map segmentation tasks on the nuScenes dataset, respectively, demonstrating the superiority of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04377v1-abstract-full').style.display = 'none'; document.getElementById('2502.04377v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04347">arXiv:2502.04347</a> <span> [<a href="https://arxiv.org/pdf/2502.04347">pdf</a>, <a href="https://arxiv.org/format/2502.04347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SCALM: Detecting Bad Practices in Smart Contracts Through LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongwei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoqi Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenkai Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04347v1-abstract-short" style="display: inline;"> As the Ethereum platform continues to mature and gain widespread usage, it is crucial to maintain high standards of smart contract writing practices. While bad practices in smart contracts may not directly lead to security issues, they do elevate the risk of encountering problems. Therefore, to understand and avoid these bad practices, this paper introduces the first systematic study of bad practi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04347v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04347v1-abstract-full" style="display: none;"> As the Ethereum platform continues to mature and gain widespread usage, it is crucial to maintain high standards of smart contract writing practices. While bad practices in smart contracts may not directly lead to security issues, they do elevate the risk of encountering problems. Therefore, to understand and avoid these bad practices, this paper introduces the first systematic study of bad practices in smart contracts, delving into over 35 specific issues. Specifically, we propose a large language models (LLMs)-based framework, SCALM. It combines Step-Back Prompting and Retrieval-Augmented Generation (RAG) to identify and address various bad practices effectively. Our extensive experiments using multiple LLMs and datasets have shown that SCALM outperforms existing tools in detecting bad practices in smart contracts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04347v1-abstract-full').style.display = 'none'; document.getElementById('2502.04347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04293">arXiv:2502.04293</a> <span> [<a href="https://arxiv.org/pdf/2502.04293">pdf</a>, <a href="https://arxiv.org/format/2502.04293">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GCE-Pose: Global Context Enhancement for Category-level Object Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Weihang Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongli Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junwen Huang</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+H">Hyunjun Jung</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+K">Peter KT Yu</a>, <a href="/search/cs?searchtype=author&query=Navab%2C+N">Nassir Navab</a>, <a href="/search/cs?searchtype=author&query=Busam%2C+B">Benjamin Busam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04293v1-abstract-short" style="display: inline;"> A key challenge in model-free category-level pose estimation is the extraction of contextual object features that generalize across varying instances within a specific category. Recent approaches leverage foundational features to capture semantic and geometry cues from data. However, these approaches fail under partial visibility. We overcome this with a first-complete-then-aggregate strategy for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04293v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04293v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04293v1-abstract-full" style="display: none;"> A key challenge in model-free category-level pose estimation is the extraction of contextual object features that generalize across varying instances within a specific category. Recent approaches leverage foundational features to capture semantic and geometry cues from data. However, these approaches fail under partial visibility. We overcome this with a first-complete-then-aggregate strategy for feature extraction utilizing class priors. In this paper, we present GCE-Pose, a method that enhances pose estimation for novel instances by integrating category-level global context prior. GCE-Pose performs semantic shape reconstruction with a proposed Semantic Shape Reconstruction (SSR) module. Given an unseen partial RGB-D object instance, our SSR module reconstructs the instance's global geometry and semantics by deforming category-specific 3D semantic prototypes through a learned deep Linear Shape Model. We further introduce a Global Context Enhanced (GCE) feature fusion module that effectively fuses features from partial RGB-D observations and the reconstructed global context. Extensive experiments validate the impact of our global context prior and the effectiveness of the GCE fusion module, demonstrating that GCE-Pose significantly outperforms existing methods on challenging real-world datasets HouseCat6D and NOCS-REAL275. Our project page is available at https://colin-de.github.io/GCE-Pose/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04293v1-abstract-full').style.display = 'none'; document.getElementById('2502.04293v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03755">arXiv:2502.03755</a> <span> [<a href="https://arxiv.org/pdf/2502.03755">pdf</a>, <a href="https://arxiv.org/format/2502.03755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Regularization via f-Divergence: An Application to Multi-Oxide Spectroscopic Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Weizhi Li</a>, <a href="/search/cs?searchtype=author&query=Klein%2C+N">Natalie Klein</a>, <a href="/search/cs?searchtype=author&query=Gifford%2C+B">Brendan Gifford</a>, <a href="/search/cs?searchtype=author&query=Sklute%2C+E">Elizabeth Sklute</a>, <a href="/search/cs?searchtype=author&query=Legett%2C+C">Carey Legett</a>, <a href="/search/cs?searchtype=author&query=Clegg%2C+S">Samuel Clegg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03755v1-abstract-short" style="display: inline;"> In this paper, we address the task of characterizing the chemical composition of planetary surfaces using convolutional neural networks (CNNs). Specifically, we seek to predict the multi-oxide weights of rock samples based on spectroscopic data collected under Martian conditions. We frame this problem as a multi-target regression task and propose a novel regularization method based on f-divergence… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03755v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03755v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03755v1-abstract-full" style="display: none;"> In this paper, we address the task of characterizing the chemical composition of planetary surfaces using convolutional neural networks (CNNs). Specifically, we seek to predict the multi-oxide weights of rock samples based on spectroscopic data collected under Martian conditions. We frame this problem as a multi-target regression task and propose a novel regularization method based on f-divergence. The f-divergence regularization is designed to constrain the distributional discrepancy between predictions and noisy targets. This regularizer serves a dual purpose: on the one hand, it mitigates overfitting by enforcing a constraint on the distributional difference between predictions and noisy targets. On the other hand, it acts as an auxiliary loss function, penalizing the neural network when the divergence between the predicted and target distributions becomes too large. To enable backpropagation during neural network training, we develop a differentiable f-divergence and incorporate it into the f-divergence regularization, making the network training feasible. We conduct experiments using spectra collected in a Mars-like environment by the remote-sensing instruments aboard the Curiosity and Perseverance rovers. Experimental results on multi-oxide weight prediction demonstrate that the proposed $f$-divergence regularization performs better than or comparable to standard regularization methods including $L_1$, $L_2$, and dropout. Notably, combining the $f$-divergence regularization with these standard regularization further enhances performance, outperforming each regularization method used independently. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03755v1-abstract-full').style.display = 'none'; document.getElementById('2502.03755v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02385">arXiv:2502.02385</a> <span> [<a href="https://arxiv.org/pdf/2502.02385">pdf</a>, <a href="https://arxiv.org/format/2502.02385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Achieving Hiding and Smart Anti-Jamming Communication: A Parallel DRL Approach against Moving Reactive Jammer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yangyang Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yuhua Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wen Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoxin Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhibing Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songyi Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jiatao Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinran Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02385v1-abstract-short" style="display: inline;"> This paper addresses the challenge of anti-jamming in moving reactive jamming scenarios. The moving reactive jammer initiates high-power tracking jamming upon detecting any transmission activity, and when unable to detect a signal, resorts to indiscriminate jamming. This presents dual imperatives: maintaining hiding to avoid the jammer's detection and simultaneously evading indiscriminate jamming.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02385v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02385v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02385v1-abstract-full" style="display: none;"> This paper addresses the challenge of anti-jamming in moving reactive jamming scenarios. The moving reactive jammer initiates high-power tracking jamming upon detecting any transmission activity, and when unable to detect a signal, resorts to indiscriminate jamming. This presents dual imperatives: maintaining hiding to avoid the jammer's detection and simultaneously evading indiscriminate jamming. Spread spectrum techniques effectively reduce transmitting power to elude detection but fall short in countering indiscriminate jamming. Conversely, changing communication frequencies can help evade indiscriminate jamming but makes the transmission vulnerable to tracking jamming without spread spectrum techniques to remain hidden. Current methodologies struggle with the complexity of simultaneously optimizing these two requirements due to the expansive joint action spaces and the dynamics of moving reactive jammers. To address these challenges, we propose a parallelized deep reinforcement learning (DRL) strategy. The approach includes a parallelized network architecture designed to decompose the action space. A parallel exploration-exploitation selection mechanism replaces the $\varepsilon $-greedy mechanism, accelerating convergence. Simulations demonstrate a nearly 90\% increase in normalized throughput. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02385v1-abstract-full').style.display = 'none'; document.getElementById('2502.02385v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository