CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 4,508 results for author: <span class="mathjax">Chen, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Chen%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Chen, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chen%2C+X&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chen, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17419">arXiv:2502.17419</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17419">pdf</a>, <a href="https://arxiv.org/format/2502.17419">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> From System 1 to System 2: A Survey of Reasoning Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhong-Zhi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Duzhen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming-Liang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiaxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zengyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuxuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haotian Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+J">Junhao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Pei-Jie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yingying Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+F">Fei Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Jiahua Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Le Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Cheng-Lin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17419v1-abstract-short" style="display: inline;"> Achieving human-level intelligence requires refining the transition from the fast, intuitive System 1 to the slower, more deliberate System 2 reasoning. While System 1 excels in quick, heuristic decisions, System 2 relies on logical reasoning for more accurate judgments and reduced biases. Foundational Large Language Models (LLMs) excel at fast decision-making but lack the depth for complex reason&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17419v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17419v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17419v1-abstract-full" style="display: none;"> Achieving human-level intelligence requires refining the transition from the fast, intuitive System 1 to the slower, more deliberate System 2 reasoning. While System 1 excels in quick, heuristic decisions, System 2 relies on logical reasoning for more accurate judgments and reduced biases. Foundational Large Language Models (LLMs) excel at fast decision-making but lack the depth for complex reasoning, as they have not yet fully embraced the step-by-step analysis characteristic of true System 2 thinking. Recently, reasoning LLMs like OpenAI&#39;s o1/o3 and DeepSeek&#39;s R1 have demonstrated expert-level performance in fields such as mathematics and coding, closely mimicking the deliberate reasoning of System 2 and showcasing human-like cognitive abilities. This survey begins with a brief overview of the progress in foundational LLMs and the early development of System 2 technologies, exploring how their combination has paved the way for reasoning LLMs. Next, we discuss how to construct reasoning LLMs, analyzing their features, the core methods enabling advanced reasoning, and the evolution of various reasoning LLMs. Additionally, we provide an overview of reasoning benchmarks, offering an in-depth comparison of the performance of representative reasoning LLMs. Finally, we explore promising directions for advancing reasoning LLMs and maintain a real-time \href{https://github.com/zzli2022/Awesome-Slow-Reason-System}{GitHub Repository} to track the latest developments. We hope this survey will serve as a valuable resource to inspire innovation and drive progress in this rapidly evolving field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17419v1-abstract-full').style.display = 'none'; document.getElementById('2502.17419v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Slow-thinking, Large Language Models, Human-like Reasoning, Decision Making in AI, AGI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17414">arXiv:2502.17414</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17414">pdf</a>, <a href="https://arxiv.org/format/2502.17414">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-Dancer: Expressive Music to Human Dance Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zeyuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hongyi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+G">Guoxian Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">You Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenxu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+D">Di Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Linjie Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17414v1-abstract-short" style="display: inline;"> We present X-Dancer, a novel zero-shot music-driven image animation pipeline that creates diverse and long-range lifelike human dance videos from a single static image. As its core, we introduce a unified transformer-diffusion framework, featuring an autoregressive transformer model that synthesize extended and music-synchronized token sequences for 2D body, head and hands poses, which then guide&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17414v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17414v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17414v1-abstract-full" style="display: none;"> We present X-Dancer, a novel zero-shot music-driven image animation pipeline that creates diverse and long-range lifelike human dance videos from a single static image. As its core, we introduce a unified transformer-diffusion framework, featuring an autoregressive transformer model that synthesize extended and music-synchronized token sequences for 2D body, head and hands poses, which then guide a diffusion model to produce coherent and realistic dance video frames. Unlike traditional methods that primarily generate human motion in 3D, X-Dancer addresses data limitations and enhances scalability by modeling a wide spectrum of 2D dance motions, capturing their nuanced alignment with musical beats through readily available monocular videos. To achieve this, we first build a spatially compositional token representation from 2D human pose labels associated with keypoint confidences, encoding both large articulated body movements (e.g., upper and lower body) and fine-grained motions (e.g., head and hands). We then design a music-to-motion transformer model that autoregressively generates music-aligned dance pose token sequences, incorporating global attention to both musical style and prior motion context. Finally we leverage a diffusion backbone to animate the reference image with these synthesized pose tokens through AdaIN, forming a fully differentiable end-to-end framework. Experimental results demonstrate that X-Dancer is able to produce both diverse and characterized dance videos, substantially outperforming state-of-the-art methods in term of diversity, expressiveness and realism. Code and model will be available for research purposes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17414v1-abstract-full').style.display = 'none'; document.getElementById('2502.17414v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17213">arXiv:2502.17213</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17213">pdf</a>, <a href="https://arxiv.org/format/2502.17213">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning-Powered Electrical Brain Signals Analysis: Advancing Neurological Diagnostics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiahe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+F">Fanqi Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Junru Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Daoze Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Z">Zhizhang Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+F">Fang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Meng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17213v1-abstract-short" style="display: inline;"> Neurological disorders represent significant global health challenges, driving the advancement of brain signal analysis methods. Scalp electroencephalography (EEG) and intracranial electroencephalography (iEEG) are widely used to diagnose and monitor neurological conditions. However, dataset heterogeneity and task variations pose challenges in developing robust deep learning solutions. This review&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17213v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17213v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17213v1-abstract-full" style="display: none;"> Neurological disorders represent significant global health challenges, driving the advancement of brain signal analysis methods. Scalp electroencephalography (EEG) and intracranial electroencephalography (iEEG) are widely used to diagnose and monitor neurological conditions. However, dataset heterogeneity and task variations pose challenges in developing robust deep learning solutions. This review systematically examines recent advances in deep learning approaches for EEG/iEEG-based neurological diagnostics, focusing on applications across 7 neurological conditions using 46 datasets. We explore trends in data utilization, model design, and task-specific adaptations, highlighting the importance of pre-trained multi-task models for scalable, generalizable solutions. To advance research, we propose a standardized benchmark for evaluating models across diverse datasets to enhance reproducibility. This survey emphasizes how recent innovations can transform neurological diagnostics and enable the development of intelligent, adaptable healthcare solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17213v1-abstract-full').style.display = 'none'; document.getElementById('2502.17213v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16880">arXiv:2502.16880</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.16880">pdf</a>, <a href="https://arxiv.org/format/2502.16880">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CORAL: Learning Consistent Representations across Multi-step Training with Lighter Speculative Drafter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Weng%2C+Y">Yepeng Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+D">Dianwen Mei</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+H">Huishi Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xujie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+J">Jiang Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zhongchao Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16880v1-abstract-short" style="display: inline;"> Speculative decoding is a powerful technique that accelerates Large Language Model (LLM) inference by leveraging a lightweight speculative draft model. However, existing designs suffers in performance due to misalignment between training and inference. Recent methods have tried to solve this issue by adopting a multi-step training strategy, but the complex inputs of different training steps make i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16880v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16880v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16880v1-abstract-full" style="display: none;"> Speculative decoding is a powerful technique that accelerates Large Language Model (LLM) inference by leveraging a lightweight speculative draft model. However, existing designs suffers in performance due to misalignment between training and inference. Recent methods have tried to solve this issue by adopting a multi-step training strategy, but the complex inputs of different training steps make it harder for the draft model to converge. To address this, we propose CORAL, a novel framework that improves both accuracy and efficiency in speculative drafting. CORAL introduces Cross-Step Representation Alignment, a method that enhances consistency across multiple training steps, significantly improving speculative drafting performance. Additionally, we identify the LM head as a major bottleneck in the inference speed of the draft model. We introduce a weight-grouping mechanism that selectively activates a subset of LM head parameters during inference, substantially reducing the latency of the draft model. We evaluate CORAL on three LLM families and three benchmark datasets, achieving speedup ratios of 2.50x-4.07x, outperforming state-of-the-art methods such as EAGLE-2 and HASS. Our results demonstrate that CORAL effectively mitigates training-inference misalignment and delivers significant speedup for modern LLMs with large vocabularies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16880v1-abstract-full').style.display = 'none'; document.getElementById('2502.16880v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16548">arXiv:2502.16548</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.16548">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Composable Strategy Framework with Integrated Video-Text based Large Language Models for Heart Failure Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jianzhou Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiumei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jinyang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+H">Heyu Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+G">Guo Song</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuji Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xingping Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+R">Rong Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16548v1-abstract-short" style="display: inline;"> Heart failure is one of the leading causes of death worldwide, with millons of deaths each year, according to data from the World Health Organization (WHO) and other public health agencies. While significant progress has been made in the field of heart failure, leading to improved survival rates and improvement of ejection fraction, there remains substantial unmet needs, due to the complexity and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16548v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16548v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16548v1-abstract-full" style="display: none;"> Heart failure is one of the leading causes of death worldwide, with millons of deaths each year, according to data from the World Health Organization (WHO) and other public health agencies. While significant progress has been made in the field of heart failure, leading to improved survival rates and improvement of ejection fraction, there remains substantial unmet needs, due to the complexity and multifactorial characteristics. Therefore, we propose a composable strategy framework for assessment and treatment optimization in heart failure. This framework simulates the doctor-patient consultation process and leverages multi-modal algorithms to analyze a range of data, including video, physical examination, text results as well as medical history. By integrating these various data sources, our framework offers a more holistic evaluation and optimized treatment plan for patients. Our results demonstrate that this multi-modal approach outperforms single-modal artificial intelligence (AI) algorithms in terms of accuracy in heart failure (HF) prognosis prediction. Through this method, we can further evaluate the impact of various pathological indicators on HF prognosis,providing a more comprehensive evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16548v1-abstract-full').style.display = 'none'; document.getElementById('2502.16548v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16355">arXiv:2502.16355</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.16355">pdf</a>, <a href="https://arxiv.org/ps/2502.16355">ps</a>, <a href="https://arxiv.org/format/2502.16355">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Monotonicity Testing of High-Dimensional Distributions with Subcube Conditioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chakrabarty%2C+D">Deeparnab Chakrabarty</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ristic%2C+S">Simeon Ristic</a>, <a href="/search/cs?searchtype=author&amp;query=Seshadhri%2C+C">C. Seshadhri</a>, <a href="/search/cs?searchtype=author&amp;query=Waingarten%2C+E">Erik Waingarten</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16355v1-abstract-short" style="display: inline;"> We study monotonicity testing of high-dimensional distributions on $\{-1,1\}^n$ in the model of subcube conditioning, suggested and studied by Canonne, Ron, and Servedio~\cite{CRS15} and Bhattacharyya and Chakraborty~\cite{BC18}. Previous work shows that the \emph{sample complexity} of monotonicity testing must be exponential in $n$ (Rubinfeld, Vasilian~\cite{RV20}, and Aliakbarpour, Gouleakis, Pe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16355v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16355v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16355v1-abstract-full" style="display: none;"> We study monotonicity testing of high-dimensional distributions on $\{-1,1\}^n$ in the model of subcube conditioning, suggested and studied by Canonne, Ron, and Servedio~\cite{CRS15} and Bhattacharyya and Chakraborty~\cite{BC18}. Previous work shows that the \emph{sample complexity} of monotonicity testing must be exponential in $n$ (Rubinfeld, Vasilian~\cite{RV20}, and Aliakbarpour, Gouleakis, Peebles, Rubinfeld, Yodpinyanee~\cite{AGPRY19}). We show that the subcube \emph{query complexity} is $\tilde螛(n/\varepsilon^2)$, by proving nearly matching upper and lower bounds. Our work is the first to use directed isoperimetric inequalities (developed for function monotonicity testing) for analyzing a distribution testing algorithm. Along the way, we generalize an inequality of Khot, Minzer, and Safra~\cite{KMS18} to real-valued functions on $\{-1,1\}^n$. We also study uniformity testing of distributions that are promised to be monotone, a problem introduced by Rubinfeld, Servedio~\cite{RS09} , using subcube conditioning. We show that the query complexity is $\tilde螛(\sqrt{n}/\varepsilon^2)$. Our work proves the lower bound, which matches (up to poly-logarithmic factors) the uniformity testing upper bound for general distributions (Canonne, Chen, Kamath, Levi, Waingarten~\cite{CCKLW21}). Hence, we show that monotonicity does not help, beyond logarithmic factors, in testing uniformity of distributions with subcube conditional queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16355v1-abstract-full').style.display = 'none'; document.getElementById('2502.16355v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15888">arXiv:2502.15888</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15888">pdf</a>, <a href="https://arxiv.org/format/2502.15888">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Understanding and Evaluating Hallucinations in 3D Visual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+R">Ruiying Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kaiyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weichen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinlei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15888v1-abstract-short" style="display: inline;"> Recently, 3D-LLMs, which combine point-cloud encoders with large models, have been proposed to tackle complex tasks in embodied intelligence and scene understanding. In addition to showing promising results on 3D tasks, we found that they are significantly affected by hallucinations. For instance, they may generate objects that do not exist in the scene or produce incorrect relationships between o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15888v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15888v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15888v1-abstract-full" style="display: none;"> Recently, 3D-LLMs, which combine point-cloud encoders with large models, have been proposed to tackle complex tasks in embodied intelligence and scene understanding. In addition to showing promising results on 3D tasks, we found that they are significantly affected by hallucinations. For instance, they may generate objects that do not exist in the scene or produce incorrect relationships between objects. To investigate this issue, this work presents the first systematic study of hallucinations in 3D-LLMs. We begin by quickly evaluating hallucinations in several representative 3D-LLMs and reveal that they are all significantly affected by hallucinations. We then define hallucinations in 3D scenes and, through a detailed analysis of datasets, uncover the underlying causes of these hallucinations. We find three main causes: (1) Uneven frequency distribution of objects in the dataset. (2) Strong correlations between objects. (3) Limited diversity in object attributes. Additionally, we propose new evaluation metrics for hallucinations, including Random Point Cloud Pair and Opposite Question Evaluations, to assess whether the model generates responses based on visual information and aligns it with the text&#39;s meaning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15888v1-abstract-full').style.display = 'none'; document.getElementById('2502.15888v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15702">arXiv:2502.15702</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15702">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large language models streamline automated systematic review: A preliminary study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15702v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown promise in natural language processing tasks, with the potential to automate systematic reviews. This study evaluates the performance of three state-of-the-art LLMs in conducting systematic review tasks. We assessed GPT-4, Claude-3, and Mistral 8x7B across four systematic review tasks: study design formulation, search strategy development, literature screeni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15702v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15702v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15702v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown promise in natural language processing tasks, with the potential to automate systematic reviews. This study evaluates the performance of three state-of-the-art LLMs in conducting systematic review tasks. We assessed GPT-4, Claude-3, and Mistral 8x7B across four systematic review tasks: study design formulation, search strategy development, literature screening, and data extraction. Sourced from a previously published systematic review, we provided reference standard including standard PICO (Population, Intervention, Comparison, Outcome) design, standard eligibility criteria, and data from 20 reference literature. Three investigators evaluated the quality of study design and eligibility criteria using 5-point Liker Scale in terms of accuracy, integrity, relevance, consistency and overall performance. For other tasks, the output is defined as accurate if it is the same as the reference standard. Search strategy performance was evaluated through accuracy and retrieval efficacy. Screening accuracy was assessed for both abstracts screening and full texts screening. Data extraction accuracy was evaluated across 1,120 data points comprising 3,360 individual fields. Claude-3 demonstrated superior overall performance in PICO design. In search strategy formulation, GPT-4 and Claude-3 achieved comparable accuracy, outperforming Mistral. For abstract screening, GPT-4 achieved the highest accuracy, followed by Mistral and Claude-3. In data extraction, GPT-4 significantly outperformed other models. LLMs demonstrate potential for automating systematic review tasks, with GPT-4 showing superior performance in search strategy formulation, literature screening and data extraction. These capabilities make them promising assistive tools for researchers and warrant further development and validation in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15702v1-abstract-full').style.display = 'none'; document.getElementById('2502.15702v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15616">arXiv:2502.15616</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15616">pdf</a>, <a href="https://arxiv.org/format/2502.15616">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Pastiche Novel Generation Creating: Fan Fiction You Love in Your Favorite Author&#39;s Style </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xueran Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuhan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingzhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Sen Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15616v1-abstract-short" style="display: inline;"> Great novels create immersive worlds with rich character arcs, well-structured plots, and nuanced writing styles. However, current novel generation methods often rely on brief, simplistic story outlines and generate details using plain, generic language. To bridge this gap, we introduce the task of Pastiche Novel Generation, which requires the generated novels to imitate the distinctive features o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15616v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15616v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15616v1-abstract-full" style="display: none;"> Great novels create immersive worlds with rich character arcs, well-structured plots, and nuanced writing styles. However, current novel generation methods often rely on brief, simplistic story outlines and generate details using plain, generic language. To bridge this gap, we introduce the task of Pastiche Novel Generation, which requires the generated novels to imitate the distinctive features of the original work, including understanding character profiles, predicting plausible plot developments, and writing concrete details using vivid, expressive language. To achieve this, we propose WriterAgent, a novel generation system designed to master the core aspects of literary pastiche. WriterAgent is trained through a curriculum learning paradigm, progressing from low-level stylistic mastery to high-level narrative coherence. Its key tasks include language style learning, character modeling, plot planning, and stylish writing, ensuring comprehensive narrative control. To support this, WriterAgent leverages the WriterLoRA framework, an extension of LoRA with hierarchical and cumulative task-specific modules, each specializing in a different narrative aspect. We evaluate WriterAgent on multilingual classics like Harry Potter and Dream of the Red Chamber, demonstrating its superiority over baselines in capturing the target author&#39;s settings, character dynamics, and writing style to produce coherent, faithful narratives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15616v1-abstract-full').style.display = 'none'; document.getElementById('2502.15616v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15609">arXiv:2502.15609</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15609">pdf</a>, <a href="https://arxiv.org/format/2502.15609">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On the Robustness of Transformers against Context Hijacking for Linear Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianle Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingwu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+D">Difan Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15609v1-abstract-short" style="display: inline;"> Transformer-based Large Language Models (LLMs) have demonstrated powerful in-context learning capabilities. However, their predictions can be disrupted by factually correct context, a phenomenon known as context hijacking, revealing a significant robustness issue. To understand this phenomenon theoretically, we explore an in-context linear classification problem based on recent advances in linear&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15609v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15609v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15609v1-abstract-full" style="display: none;"> Transformer-based Large Language Models (LLMs) have demonstrated powerful in-context learning capabilities. However, their predictions can be disrupted by factually correct context, a phenomenon known as context hijacking, revealing a significant robustness issue. To understand this phenomenon theoretically, we explore an in-context linear classification problem based on recent advances in linear transformers. In our setup, context tokens are designed as factually correct query-answer pairs, where the queries are similar to the final query but have opposite labels. Then, we develop a general theoretical analysis on the robustness of the linear transformers, which is formulated as a function of the model depth, training context lengths, and number of hijacking context tokens. A key finding is that a well-trained deeper transformer can achieve higher robustness, which aligns with empirical observations. We show that this improvement arises because deeper layers enable more fine-grained optimization steps, effectively mitigating interference from context hijacking. This is also well supported by our numerical experiments. Our findings provide theoretical insights into the benefits of deeper architectures and contribute to enhancing the understanding of transformer architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15609v1-abstract-full').style.display = 'none'; document.getElementById('2502.15609v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15237">arXiv:2502.15237</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15237">pdf</a>, <a href="https://arxiv.org/format/2502.15237">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> From Documents to Dialogue: Building KG-RAG Enhanced AI Assistants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mukherjee%2C+M">Manisha Mukherjee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+D">Dan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Mai%2C+T">Tung Mai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15237v1-abstract-short" style="display: inline;"> The Adobe Experience Platform AI Assistant is a conversational tool that enables organizations to interact seamlessly with proprietary enterprise data through a chatbot. However, due to access restrictions, Large Language Models (LLMs) cannot retrieve these internal documents, limiting their ability to generate accurate zero-shot responses. To overcome this limitation, we use a Retrieval-Augmented&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15237v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15237v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15237v1-abstract-full" style="display: none;"> The Adobe Experience Platform AI Assistant is a conversational tool that enables organizations to interact seamlessly with proprietary enterprise data through a chatbot. However, due to access restrictions, Large Language Models (LLMs) cannot retrieve these internal documents, limiting their ability to generate accurate zero-shot responses. To overcome this limitation, we use a Retrieval-Augmented Generation (RAG) framework powered by a Knowledge Graph (KG) to retrieve relevant information from external knowledge sources, enabling LLMs to answer questions over private or previously unseen document collections. In this paper, we propose a novel approach for building a high-quality, low-noise KG. We apply several techniques, including incremental entity resolution using seed concepts, similarity-based filtering to deduplicate entries, assigning confidence scores to entity-relation pairs to filter for high-confidence pairs, and linking facts to source documents for provenance. Our KG-RAG system retrieves relevant tuples, which are added to the user prompts context before being sent to the LLM generating the response. Our evaluation demonstrates that this approach significantly enhances response relevance, reducing irrelevant answers by over 50% and increasing fully relevant answers by 88% compared to the existing production system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15237v1-abstract-full').style.display = 'none'; document.getElementById('2502.15237v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15172">arXiv:2502.15172</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15172">pdf</a>, <a href="https://arxiv.org/format/2502.15172">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BP-GPT: Auditory Neural Decoding Using fMRI-prompted LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaoyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+C">Changde Du</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Che Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+H">Huiguang He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15172v1-abstract-short" style="display: inline;"> Decoding language information from brain signals represents a vital research area within brain-computer interfaces, particularly in the context of deciphering the semantic information from the fMRI signal. Although existing work uses LLM to achieve this goal, their method does not use an end-to-end approach and avoids the LLM in the mapping of fMRI-to-text, leaving space for the exploration of the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15172v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15172v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15172v1-abstract-full" style="display: none;"> Decoding language information from brain signals represents a vital research area within brain-computer interfaces, particularly in the context of deciphering the semantic information from the fMRI signal. Although existing work uses LLM to achieve this goal, their method does not use an end-to-end approach and avoids the LLM in the mapping of fMRI-to-text, leaving space for the exploration of the LLM in auditory decoding. In this paper, we introduce a novel method, the Brain Prompt GPT (BP-GPT). By using the brain representation that is extracted from the fMRI as a prompt, our method can utilize GPT-2 to decode fMRI signals into stimulus text. Further, we introduce the text prompt and align the fMRI prompt to it. By introducing the text prompt, our BP-GPT can extract a more robust brain prompt and promote the decoding of pre-trained LLM. We evaluate our BP-GPT on the open-source auditory semantic decoding dataset and achieve a significant improvement up to 4.61 on METEOR and 2.43 on BERTScore across all the subjects compared to the state-of-the-art method. The experimental results demonstrate that using brain representation as a prompt to further drive LLM for auditory neural decoding is feasible and effective. The code is available at https://github.com/1994cxy/BP-GPT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15172v1-abstract-full').style.display = 'none'; document.getElementById('2502.15172v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2405.07840</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15130">arXiv:2502.15130</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15130">pdf</a>, <a href="https://arxiv.org/format/2502.15130">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TransMamba: Fast Universal Architecture Adaption from Transformers to Mamba </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuwei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+S">Sihao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+X">Xiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zisheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+M">Meng Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15130v1-abstract-short" style="display: inline;"> Transformers have been favored in both uni-modal and multi-modal foundation models for their flexible scalability in attention modules. Consequently, a number of pre-trained Transformer models, e.g., LLaVA, CLIP, and DEIT, are publicly available. Recent research has introduced subquadratic architectures like Mamba, which enables global awareness with linear complexity. Nevertheless, training speci&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15130v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15130v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15130v1-abstract-full" style="display: none;"> Transformers have been favored in both uni-modal and multi-modal foundation models for their flexible scalability in attention modules. Consequently, a number of pre-trained Transformer models, e.g., LLaVA, CLIP, and DEIT, are publicly available. Recent research has introduced subquadratic architectures like Mamba, which enables global awareness with linear complexity. Nevertheless, training specialized subquadratic architectures from scratch for certain tasks is both resource-intensive and time-consuming. As a motivator, we explore cross-architecture training to transfer the ready knowledge in existing Transformer models to alternative architecture Mamba, termed TransMamba. Our approach employs a two-stage strategy to expedite training new Mamba models, ensuring effectiveness in across uni-modal and cross-modal tasks. Concerning architecture disparities, we project the intermediate features into an aligned latent space before transferring knowledge. On top of that, a Weight Subcloning and Adaptive Bidirectional distillation method (WSAB) is introduced for knowledge transfer without limitations on varying layer counts. For cross-modal learning, we propose a cross-Mamba module that integrates language awareness into Mamba&#39;s visual features, enhancing the cross-modal interaction capabilities of Mamba architecture. Despite using less than 75% of the training data typically required for training from scratch, TransMamba boasts substantially stronger performance across various network architectures and downstream tasks, including image classification, visual question answering, and text-video retrieval. The code will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15130v1-abstract-full').style.display = 'none'; document.getElementById('2502.15130v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15127">arXiv:2502.15127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15127">pdf</a>, <a href="https://arxiv.org/format/2502.15127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> The Imitation Game for Educational AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sonkar%2C+S">Shashank Sonkar</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Naiming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinghe Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15127v1-abstract-short" style="display: inline;"> As artificial intelligence systems become increasingly prevalent in education, a fundamental challenge emerges: how can we verify if an AI truly understands how students think and reason? Traditional evaluation methods like measuring learning gains require lengthy studies confounded by numerous variables. We present a novel evaluation framework based on a two-phase Turing-like test. In Phase 1, st&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15127v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15127v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15127v1-abstract-full" style="display: none;"> As artificial intelligence systems become increasingly prevalent in education, a fundamental challenge emerges: how can we verify if an AI truly understands how students think and reason? Traditional evaluation methods like measuring learning gains require lengthy studies confounded by numerous variables. We present a novel evaluation framework based on a two-phase Turing-like test. In Phase 1, students provide open-ended responses to questions, revealing natural misconceptions. In Phase 2, both AI and human experts, conditioned on each student&#39;s specific mistakes, generate distractors for new related questions. By analyzing whether students select AI-generated distractors at rates similar to human expert-generated ones, we can validate if the AI models student cognition. We prove this evaluation must be conditioned on individual responses - unconditioned approaches merely target common misconceptions. Through rigorous statistical sampling theory, we establish precise requirements for high-confidence validation. Our research positions conditioned distractor generation as a probe into an AI system&#39;s fundamental ability to model student thinking - a capability that enables adapting tutoring, feedback, and assessments to each student&#39;s specific needs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15127v1-abstract-full').style.display = 'none'; document.getElementById('2502.15127v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14992">arXiv:2502.14992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14992">pdf</a>, <a href="https://arxiv.org/format/2502.14992">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Ultra-High-Frequency Harmony: mmWave Radar and Event Camera Orchestrate Accurate Drone Landing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jingao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xinyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuecheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Ting Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+R">Ruiyang Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yunhao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinlei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14992v1-abstract-short" style="display: inline;"> For precise, efficient, and safe drone landings, ground platforms should real-time, accurately locate descending drones and guide them to designated spots. While mmWave sensing combined with cameras improves localization accuracy, the lower sampling frequency of traditional frame cameras compared to mmWave radar creates bottlenecks in system throughput. In this work, we replace the traditional fra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14992v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14992v1-abstract-full" style="display: none;"> For precise, efficient, and safe drone landings, ground platforms should real-time, accurately locate descending drones and guide them to designated spots. While mmWave sensing combined with cameras improves localization accuracy, the lower sampling frequency of traditional frame cameras compared to mmWave radar creates bottlenecks in system throughput. In this work, we replace the traditional frame camera with event camera, a novel sensor that harmonizes in sampling frequency with mmWave radar within the ground platform setup, and introduce mmE-Loc, a high-precision, low-latency ground localization system designed for drone landings. To fully leverage the \textit{temporal consistency} and \textit{spatial complementarity} between these modalities, we propose two innovative modules, \textit{consistency-instructed collaborative tracking} and \textit{graph-informed adaptive joint optimization}, for accurate drone measurement extraction and efficient sensor fusion. Extensive real-world experiments in landing scenarios from a leading drone delivery company demonstrate that mmE-Loc outperforms state-of-the-art methods in both localization accuracy and latency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14992v1-abstract-full').style.display = 'none'; document.getElementById('2502.14992v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by ACM SenSys 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14504">arXiv:2502.14504</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14504">pdf</a>, <a href="https://arxiv.org/format/2502.14504">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PLPHP: Per-Layer Per-Head Vision Token Pruning for Efficient Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+Y">Yu Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kaiyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chenran Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinlei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14504v1-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have demonstrated remarkable capabilities across a range of multimodal tasks. However, their inference efficiency is constrained by the large number of visual tokens processed during decoding. To address this challenge, we propose Per-Layer Per-Head Vision Token Pruning (PLPHP), a two-level fine-grained pruning method including Layer-Level Retention Rate Alloca&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14504v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14504v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14504v1-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have demonstrated remarkable capabilities across a range of multimodal tasks. However, their inference efficiency is constrained by the large number of visual tokens processed during decoding. To address this challenge, we propose Per-Layer Per-Head Vision Token Pruning (PLPHP), a two-level fine-grained pruning method including Layer-Level Retention Rate Allocation and Head-Level Vision Token Pruning. Motivated by the Vision Token Re-attention phenomenon across decoder layers, we dynamically adjust token retention rates layer by layer. Layers that exhibit stronger attention to visual information preserve more vision tokens, while layers with lower vision attention are aggressively pruned. Furthermore, PLPHP applies pruning at the attention head level, enabling different heads within the same layer to independently retain critical context. Experiments on multiple benchmarks demonstrate that PLPHP delivers an 18% faster decoding speed and reduces the Key-Value Cache (KV Cache) size by over 50%, all at the cost of 0.46% average performance drop, while also achieving notable performance improvements in multi-image tasks. These results highlight the effectiveness of fine-grained token pruning and contribute to advancing the efficiency and scalability of LVLMs. Our source code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14504v1-abstract-full').style.display = 'none'; document.getElementById('2502.14504v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14296">arXiv:2502.14296</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14296">pdf</a>, <a href="https://arxiv.org/format/2502.14296">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> On the Trustworthiness of Generative Foundation Models: Guideline, Assessment, and Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yue Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chujie Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Siyuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoran Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiangqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yujun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanbo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiayi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jiawen Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qihui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+H">Han Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhaoyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+T">Tianrui Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dongping Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Ruoxi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+K">Kehan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+A">Andy Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Kuen-Yew%2C+B+H">Bryan Hooi Kuen-Yew</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Stengel-Eskin%2C+E">Elias Stengel-Eskin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+H">Hongzhi Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+H">Huaxiu Yao</a> , et al. (41 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14296v1-abstract-short" style="display: inline;"> Generative Foundation Models (GenFMs) have emerged as transformative tools. However, their widespread adoption raises critical concerns regarding trustworthiness across dimensions. This paper presents a comprehensive framework to address these challenges through three key contributions. First, we systematically review global AI governance laws and policies from governments and regulatory bodies, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14296v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14296v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14296v1-abstract-full" style="display: none;"> Generative Foundation Models (GenFMs) have emerged as transformative tools. However, their widespread adoption raises critical concerns regarding trustworthiness across dimensions. This paper presents a comprehensive framework to address these challenges through three key contributions. First, we systematically review global AI governance laws and policies from governments and regulatory bodies, as well as industry practices and standards. Based on this analysis, we propose a set of guiding principles for GenFMs, developed through extensive multidisciplinary collaboration that integrates technical, ethical, legal, and societal perspectives. Second, we introduce TrustGen, the first dynamic benchmarking platform designed to evaluate trustworthiness across multiple dimensions and model types, including text-to-image, large language, and vision-language models. TrustGen leverages modular components--metadata curation, test case generation, and contextual variation--to enable adaptive and iterative assessments, overcoming the limitations of static evaluation methods. Using TrustGen, we reveal significant progress in trustworthiness while identifying persistent challenges. Finally, we provide an in-depth discussion of the challenges and future directions for trustworthy GenFMs, which reveals the complex, evolving nature of trustworthiness, highlighting the nuanced trade-offs between utility and trustworthiness, and consideration for various downstream applications, identifying persistent challenges and providing a strategic roadmap for future research. This work establishes a holistic framework for advancing trustworthiness in GenAI, paving the way for safer and more responsible integration of GenFMs into critical applications. To facilitate advancement in the community, we release the toolkit for dynamic evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14296v1-abstract-full').style.display = 'none'; document.getElementById('2502.14296v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14273">arXiv:2502.14273</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14273">pdf</a>, <a href="https://arxiv.org/format/2502.14273">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> LLM-EvRep: Learning an LLM-Compatible Event Representation Using a Self-Supervised Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zongyou Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+Q">Qiang Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+N">Nan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaoming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14273v1-abstract-short" style="display: inline;"> Recent advancements in event-based recognition have demonstrated significant promise, yet most existing approaches rely on extensive training, limiting their adaptability for efficient processing of event-driven visual content. Meanwhile, large language models (LLMs) have exhibited remarkable zero-shot capabilities across diverse domains, but their application to event-based visual recognition rem&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14273v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14273v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14273v1-abstract-full" style="display: none;"> Recent advancements in event-based recognition have demonstrated significant promise, yet most existing approaches rely on extensive training, limiting their adaptability for efficient processing of event-driven visual content. Meanwhile, large language models (LLMs) have exhibited remarkable zero-shot capabilities across diverse domains, but their application to event-based visual recognition remains largely unexplored. To bridge this gap, we propose \textbf{LLM-EvGen}, an event representation generator that produces LLM-compatible event representations \textbf{LLM-EvRep}, thereby enhancing the performance of LLMs on event recognition tasks. The generator is trained using a self-supervised framework, aligning the generated representations with semantic consistency and structural fidelity. Comprehensive experiments were conducted on three datasets: N-ImageNet, N-Caltech101, and N-MNIST. The results demonstrate that our method, \textbf{LLM-EvRep}, outperforms the event-to-video method, E2VID, by 15.93\%, 0.82\%, and 50.21\%, respectively, in recognition tasks when evaluated using GPT-4o. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14273v1-abstract-full').style.display = 'none'; document.getElementById('2502.14273v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures,Companion Proceedings of the ACM Web Conference 2025 (WWW Companion &#39;25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14260">arXiv:2502.14260</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14260">pdf</a>, <a href="https://arxiv.org/format/2502.14260">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EyeBench: A Call for More Rigorous Evaluation of Retinal Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+W">Wenhui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+X">Xuanzhao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Y">Yujian Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiwen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+P">Peijie Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Vasa%2C+V+K">Vamsi Krishna Vasa</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhangsihao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&amp;query=Dumitrascu%2C+O">Oana Dumitrascu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yalin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14260v1-abstract-short" style="display: inline;"> Over the past decade, generative models have achieved significant success in enhancement fundus images.However, the evaluation of these models still presents a considerable challenge. A comprehensive evaluation benchmark for fundus image enhancement is indispensable for three main reasons: 1) The existing denoising metrics (e.g., PSNR, SSIM) are hardly to extend to downstream real-world clinical r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14260v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14260v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14260v1-abstract-full" style="display: none;"> Over the past decade, generative models have achieved significant success in enhancement fundus images.However, the evaluation of these models still presents a considerable challenge. A comprehensive evaluation benchmark for fundus image enhancement is indispensable for three main reasons: 1) The existing denoising metrics (e.g., PSNR, SSIM) are hardly to extend to downstream real-world clinical research (e.g., Vessel morphology consistency). 2) There is a lack of comprehensive evaluation for both paired and unpaired enhancement methods, along with the need for expert protocols to accurately assess clinical value. 3) An ideal evaluation system should provide insights to inform future developments of fundus image enhancement. To this end, we propose a novel comprehensive benchmark, EyeBench, to provide insights that align enhancement models with clinical needs, offering a foundation for future work to improve the clinical relevance and applicability of generative models for fundus image enhancement. EyeBench has three appealing properties: 1) multi-dimensional clinical alignment downstream evaluation: In addition to evaluating the enhancement task, we provide several clinically significant downstream tasks for fundus images, including vessel segmentation, DR grading, denoising generalization, and lesion segmentation. 2) Medical expert-guided evaluation design: We introduce a novel dataset that promote comprehensive and fair comparisons between paired and unpaired methods and includes a manual evaluation protocol by medical experts. 3) Valuable insights: Our benchmark study provides a comprehensive and rigorous evaluation of existing methods across different downstream tasks, assisting medical experts in making informed choices. Additionally, we offer further analysis of the challenges faced by existing methods. The code is available at \url{https://github.com/Retinal-Research/EyeBench} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14260v1-abstract-full').style.display = 'none'; document.getElementById('2502.14260v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14063">arXiv:2502.14063</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14063">pdf</a>, <a href="https://arxiv.org/format/2502.14063">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PedDet: Adaptive Spectral Optimization for Multimodal Pedestrian Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yi Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zirui Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14063v2-abstract-short" style="display: inline;"> Pedestrian detection in intelligent transportation systems has made significant progress but faces two critical challenges: (1) insufficient fusion of complementary information between visible and infrared spectra, particularly in complex scenarios, and (2) sensitivity to illumination changes, such as low-light or overexposed conditions, leading to degraded performance. To address these issues, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14063v2-abstract-full').style.display = 'inline'; document.getElementById('2502.14063v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14063v2-abstract-full" style="display: none;"> Pedestrian detection in intelligent transportation systems has made significant progress but faces two critical challenges: (1) insufficient fusion of complementary information between visible and infrared spectra, particularly in complex scenarios, and (2) sensitivity to illumination changes, such as low-light or overexposed conditions, leading to degraded performance. To address these issues, we propose PedDet, an adaptive spectral optimization complementarity framework specifically enhanced and optimized for multispectral pedestrian detection. PedDet introduces the Multi-scale Spectral Feature Perception Module (MSFPM) to adaptively fuse visible and infrared features, enhancing robustness and flexibility in feature extraction. Additionally, the Illumination Robustness Feature Decoupling Module (IRFDM) improves detection stability under varying lighting by decoupling pedestrian and background features. We further design a contrastive alignment to enhance intermodal feature discrimination. Experiments on LLVIP and MSDS datasets demonstrate that PedDet achieves state-of-the-art performance, improving the mAP by 6.6% with superior detection accuracy even in low-light conditions, marking a significant step forward for road safety. Code will be available at https://github.com/AIGeeksGroup/PedDet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14063v2-abstract-full').style.display = 'none'; document.getElementById('2502.14063v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13759">arXiv:2502.13759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13759">pdf</a>, <a href="https://arxiv.org/format/2502.13759">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Geolocation with Real Human Gameplay Data: A Large-Scale Dataset and Human-Like Reasoning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zirui Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jingpu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tonglet%2C+J">Jonathan Tonglet</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+T">Tao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Gurevych%2C+I">Iryna Gurevych</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13759v1-abstract-short" style="display: inline;"> Geolocation, the task of identifying an image&#39;s location, requires complex reasoning and is crucial for navigation, monitoring, and cultural preservation. However, current methods often produce coarse, imprecise, and non-interpretable localization. A major challenge lies in the quality and scale of existing geolocation datasets. These datasets are typically small-scale and automatically constructe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13759v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13759v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13759v1-abstract-full" style="display: none;"> Geolocation, the task of identifying an image&#39;s location, requires complex reasoning and is crucial for navigation, monitoring, and cultural preservation. However, current methods often produce coarse, imprecise, and non-interpretable localization. A major challenge lies in the quality and scale of existing geolocation datasets. These datasets are typically small-scale and automatically constructed, leading to noisy data and inconsistent task difficulty, with images that either reveal answers too easily or lack sufficient clues for reliable inference. To address these challenges, we introduce a comprehensive geolocation framework with three key components: GeoComp, a large-scale dataset; GeoCoT, a novel reasoning method; and GeoEval, an evaluation metric, collectively designed to address critical challenges and drive advancements in geolocation research. At the core of this framework is GeoComp (Geolocation Competition Dataset), a large-scale dataset collected from a geolocation game platform involving 740K users over two years. It comprises 25 million entries of metadata and 3 million geo-tagged locations spanning much of the globe, with each location annotated thousands to tens of thousands of times by human users. The dataset offers diverse difficulty levels for detailed analysis and highlights key gaps in current models. Building on this dataset, we propose Geographical Chain-of-Thought (GeoCoT), a novel multi-step reasoning framework designed to enhance the reasoning capabilities of Large Vision Models (LVMs) in geolocation tasks. GeoCoT improves performance by integrating contextual and spatial cues through a multi-step process that mimics human geolocation reasoning. Finally, using the GeoEval metric, we demonstrate that GeoCoT significantly boosts geolocation accuracy by up to 25% while enhancing interpretability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13759v1-abstract-full').style.display = 'none'; document.getElementById('2502.13759v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Access dataset: https://huggingface.co/datasets/ShirohAO/tuxun</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13433">arXiv:2502.13433</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13433">pdf</a>, <a href="https://arxiv.org/format/2502.13433">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MATS: An Audio Language Model under Text-only Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+R">Ruibing Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+H">Hong Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+S">Shiguang Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xilin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13433v2-abstract-short" style="display: inline;"> Large audio-language models (LALMs), built upon powerful Large Language Models (LLMs), have exhibited remarkable audio comprehension and reasoning capabilities. However, the training of LALMs demands a large corpus of audio-language pairs, which requires substantial costs in both data collection and training resources. In this paper, we propose MATS, an audio-language multimodal LLM designed to ha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13433v2-abstract-full').style.display = 'inline'; document.getElementById('2502.13433v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13433v2-abstract-full" style="display: none;"> Large audio-language models (LALMs), built upon powerful Large Language Models (LLMs), have exhibited remarkable audio comprehension and reasoning capabilities. However, the training of LALMs demands a large corpus of audio-language pairs, which requires substantial costs in both data collection and training resources. In this paper, we propose MATS, an audio-language multimodal LLM designed to handle Multiple Audio task using solely Text-only Supervision. By leveraging pre-trained audio-language alignment models such as CLAP, we develop a text-only training strategy that projects the shared audio-language latent space into LLM latent space, endowing the LLM with audio comprehension capabilities without relying on audio data during training. To further bridge the modality gap between audio and language embeddings within CLAP, we propose the Strongly-related noisy text with audio (Santa) mechanism. Santa maps audio embeddings into CLAP language embedding space while preserving essential information from the audio input. Extensive experiments demonstrate that MATS, despite being trained exclusively on text data, achieves competitive performance compared to recent LALMs trained on large-scale audio-language pairs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13433v2-abstract-full').style.display = 'none'; document.getElementById('2502.13433v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages,11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13358">arXiv:2502.13358</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13358">pdf</a>, <a href="https://arxiv.org/format/2502.13358">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Editing Gap in LLMs: FineEdit for Precise and Targeted Text Modifications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yiming Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wanhao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zexin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+T">Tao Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jinghan Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiyan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tingting Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13358v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have transformed natural language processing, yet they still struggle with direct text editing tasks that demand precise, context-aware modifications. While models like ChatGPT excel in text generation and analysis, their editing abilities often fall short, addressing only superficial issues rather than deeper structural or logical inconsistencies. In this work, we int&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13358v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13358v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13358v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have transformed natural language processing, yet they still struggle with direct text editing tasks that demand precise, context-aware modifications. While models like ChatGPT excel in text generation and analysis, their editing abilities often fall short, addressing only superficial issues rather than deeper structural or logical inconsistencies. In this work, we introduce a dual approach to enhance LLMs editing performance. First, we present InstrEditBench, a high-quality benchmark dataset comprising over 20,000 structured editing tasks spanning Wiki articles, LaTeX documents, code, and database Domain-specific Languages (DSL). InstrEditBench is generated using an innovative automated workflow that accurately identifies and evaluates targeted edits, ensuring that modifications adhere strictly to specified instructions without altering unrelated content. Second, we propose FineEdit, a specialized model trained on this curated benchmark. Experimental results demonstrate that FineEdit achieves significant improvements around {10\%} compared with Gemini on direct editing tasks, convincingly validating its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13358v1-abstract-full').style.display = 'none'; document.getElementById('2502.13358v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12988">arXiv:2502.12988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12988">pdf</a>, <a href="https://arxiv.org/format/2502.12988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Profile: From Surface-Level Facts to Deep Persona Simulation in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Duzhen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Agrawal%2C+I">Ishita Agrawal</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+S">Shen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Le Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12988v1-abstract-short" style="display: inline;"> Previous approaches to persona simulation large language models (LLMs) have typically relied on learning basic biographical information, or using limited role-play dialogue datasets to capture a character&#39;s responses. However, a holistic representation of an individual goes beyond surface-level facts or conversations to deeper thoughts and thinking. In this work, we introduce CharacterBot, a model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12988v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12988v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12988v1-abstract-full" style="display: none;"> Previous approaches to persona simulation large language models (LLMs) have typically relied on learning basic biographical information, or using limited role-play dialogue datasets to capture a character&#39;s responses. However, a holistic representation of an individual goes beyond surface-level facts or conversations to deeper thoughts and thinking. In this work, we introduce CharacterBot, a model designed to replicate both the linguistic patterns and distinctive thought processes of a character. Using Lu Xun, a renowned Chinese writer, as a case study, we propose four training tasks derived from his 17 essay collections. These include a pre-training task focused on mastering external linguistic structures and knowledge, as well as three fine-tuning tasks: multiple-choice question answering, generative question answering, and style transfer, each aligning the LLM with Lu Xun&#39;s internal ideation and writing style. To optimize learning across these tasks, we introduce a CharLoRA parameter updating mechanism, where a general linguistic style expert collaborates with other task-specific experts to better study both the language style and the understanding of deeper thoughts. We evaluate CharacterBot on three tasks for linguistic accuracy and opinion comprehension, demonstrating that it significantly outperforms the baselines on our adapted metrics. We hope that this work inspires future research on deep character persona simulation LLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12988v1-abstract-full').style.display = 'none'; document.getElementById('2502.12988v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12874">arXiv:2502.12874</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12874">pdf</a>, <a href="https://arxiv.org/format/2502.12874">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Testing for Causal Fairness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+J">Jiarun Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+L">LiZhong Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Pengqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Q">Qiuning Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yurong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12874v1-abstract-short" style="display: inline;"> Causality is widely used in fairness analysis to prevent discrimination on sensitive attributes, such as genders in career recruitment and races in crime prediction. However, the current data-based Potential Outcomes Framework (POF) often leads to untrustworthy fairness analysis results when handling high-dimensional data. To address this, we introduce a distribution-based POF that transform fairn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12874v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12874v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12874v1-abstract-full" style="display: none;"> Causality is widely used in fairness analysis to prevent discrimination on sensitive attributes, such as genders in career recruitment and races in crime prediction. However, the current data-based Potential Outcomes Framework (POF) often leads to untrustworthy fairness analysis results when handling high-dimensional data. To address this, we introduce a distribution-based POF that transform fairness analysis into Distributional Closeness Testing (DCT) by intervening on sensitive attributes. We define counterfactual closeness fairness as the null hypothesis of DCT, where a sensitive attribute is considered fair if its factual and counterfactual potential outcome distributions are sufficiently close. We introduce the Norm-Adaptive Maximum Mean Discrepancy Treatment Effect (N-TE) as a statistic for measuring distributional closeness and apply DCT using the empirical estimator of NTE, referred to Counterfactual Fairness-CLOseness Testing ($\textrm{CF-CLOT}$). To ensure the trustworthiness of testing results, we establish the testing consistency of N-TE through rigorous theoretical analysis. $\textrm{CF-CLOT}$ demonstrates sensitivity in fairness analysis through the flexibility of the closeness parameter $蔚$. Unfair sensitive attributes have been successfully tested by $\textrm{CF-CLOT}$ in extensive experiments across various real-world scenarios, which validate the consistency of the testing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12874v1-abstract-full').style.display = 'none'; document.getElementById('2502.12874v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12782">arXiv:2502.12782</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12782">pdf</a>, <a href="https://arxiv.org/format/2502.12782">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VidCapBench: A Comprehensive Benchmark of Video Captioning for Controllable Text-to-Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinlong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+C">Chongling Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Y">Yushuo Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fuzheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+C">Chengru Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+T">Tieniu Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12782v1-abstract-short" style="display: inline;"> The training of controllable text-to-video (T2V) models relies heavily on the alignment between videos and captions, yet little existing research connects video caption evaluation with T2V generation assessment. This paper introduces VidCapBench, a video caption evaluation scheme specifically designed for T2V generation, agnostic to any particular caption format. VidCapBench employs a data annotat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12782v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12782v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12782v1-abstract-full" style="display: none;"> The training of controllable text-to-video (T2V) models relies heavily on the alignment between videos and captions, yet little existing research connects video caption evaluation with T2V generation assessment. This paper introduces VidCapBench, a video caption evaluation scheme specifically designed for T2V generation, agnostic to any particular caption format. VidCapBench employs a data annotation pipeline, combining expert model labeling and human refinement, to associate each collected video with key information spanning video aesthetics, content, motion, and physical laws. VidCapBench then partitions these key information attributes into automatically assessable and manually assessable subsets, catering to both the rapid evaluation needs of agile development and the accuracy requirements of thorough validation. By evaluating numerous state-of-the-art captioning models, we demonstrate the superior stability and comprehensiveness of VidCapBench compared to existing video captioning evaluation approaches. Verification with off-the-shelf T2V models reveals a significant positive correlation between scores on VidCapBench and the T2V quality evaluation metrics, indicating that VidCapBench can provide valuable guidance for training T2V models. The project is available at https://github.com/VidCapBench/VidCapBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12782v1-abstract-full').style.display = 'none'; document.getElementById('2502.12782v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12671">arXiv:2502.12671</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12671">pdf</a>, <a href="https://arxiv.org/format/2502.12671">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Baichuan-M1: Pushing the Medical Capability of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bingning Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Haizhou Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Huozhi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Liang Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingyu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+X">Xiangrong Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yupeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zecheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhengyun Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+D">Da Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Kou%2C+F">Fei Kou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+F">Fei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+F">Fuzhong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+G">Guosheng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Han Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongda Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jin He</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinjie Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Kangxi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Kegeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+L">Lei Su</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+L">Linlin Niu</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12671v1-abstract-short" style="display: inline;"> The current generation of large language models (LLMs) is typically designed for broad, general-purpose applications, while domain-specific LLMs, especially in vertical fields like medicine, remain relatively scarce. In particular, the development of highly efficient and practical LLMs for the medical domain is challenging due to the complexity of medical knowledge and the limited availability of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12671v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12671v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12671v1-abstract-full" style="display: none;"> The current generation of large language models (LLMs) is typically designed for broad, general-purpose applications, while domain-specific LLMs, especially in vertical fields like medicine, remain relatively scarce. In particular, the development of highly efficient and practical LLMs for the medical domain is challenging due to the complexity of medical knowledge and the limited availability of high-quality data. To bridge this gap, we introduce Baichuan-M1, a series of large language models specifically optimized for medical applications. Unlike traditional approaches that simply continue pretraining on existing models or apply post-training to a general base model, Baichuan-M1 is trained from scratch with a dedicated focus on enhancing medical capabilities. Our model is trained on 20 trillion tokens and incorporates a range of effective training methods that strike a balance between general capabilities and medical expertise. As a result, Baichuan-M1 not only performs strongly across general domains such as mathematics and coding but also excels in specialized medical fields. We have open-sourced Baichuan-M1-14B, a mini version of our model, which can be accessed through the following links. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12671v1-abstract-full').style.display = 'none'; document.getElementById('2502.12671v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12568">arXiv:2502.12568</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12568">pdf</a>, <a href="https://arxiv.org/format/2502.12568">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Cognitive Writing Perspective for Constrained Long-Form Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+K">Kaiyang Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+H">Honglin Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+R">Rui Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+H">Haoran Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+T">Tianle Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12568v2-abstract-short" style="display: inline;"> Like humans, Large Language Models (LLMs) struggle to generate high-quality long-form text that adheres to strict requirements in a single pass. This challenge is unsurprising, as successful human writing, according to the Cognitive Writing Theory, is a complex cognitive process involving iterative planning, translating, reviewing, and monitoring. Motivated by these cognitive principles, we aim to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12568v2-abstract-full').style.display = 'inline'; document.getElementById('2502.12568v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12568v2-abstract-full" style="display: none;"> Like humans, Large Language Models (LLMs) struggle to generate high-quality long-form text that adheres to strict requirements in a single pass. This challenge is unsurprising, as successful human writing, according to the Cognitive Writing Theory, is a complex cognitive process involving iterative planning, translating, reviewing, and monitoring. Motivated by these cognitive principles, we aim to equip LLMs with human-like cognitive writing capabilities through CogWriter, a novel training-free framework that transforms LLM constrained long-form text generation into a systematic cognitive writing paradigm. Our framework consists of two key modules: (1) a Planning Agent that performs hierarchical planning to decompose the task, and (2) multiple Generation Agents that execute these plans in parallel. The system maintains quality via continuous monitoring and reviewing mechanisms, which evaluate outputs against specified requirements and trigger necessary revisions. CogWriter demonstrates exceptional performance on LongGenBench, a benchmark for complex constrained long-form text generation. Even when using Qwen-2.5-14B as its backbone, CogWriter surpasses GPT-4o by 22% in complex instruction completion accuracy while reliably generating texts exceeding 10,000 words. We hope this cognitive science-inspired approach provides a paradigm for LLM writing advancements: \href{https://github.com/KaiyangWan/CogWriter}{CogWriter}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12568v2-abstract-full').style.display = 'none'; document.getElementById('2502.12568v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12566">arXiv:2502.12566</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12566">pdf</a>, <a href="https://arxiv.org/format/2502.12566">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Impact of Personality Traits on LLM Bias and Toxicity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Renhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yulin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+D+F">Derek F. Wong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12566v2-abstract-short" style="display: inline;"> With the different roles that AI is expected to play in human life, imbuing large language models (LLMs) with different personalities has attracted increasing research interests. While the &#34;personification&#34; enhances human experiences of interactivity and adaptability of LLMs, it gives rise to critical concerns about content safety, particularly regarding bias, sentiment and toxicity of LLM generat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12566v2-abstract-full').style.display = 'inline'; document.getElementById('2502.12566v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12566v2-abstract-full" style="display: none;"> With the different roles that AI is expected to play in human life, imbuing large language models (LLMs) with different personalities has attracted increasing research interests. While the &#34;personification&#34; enhances human experiences of interactivity and adaptability of LLMs, it gives rise to critical concerns about content safety, particularly regarding bias, sentiment and toxicity of LLM generation. This study explores how assigning different personality traits to LLMs affects the toxicity and biases of their outputs. Leveraging the widely accepted HEXACO personality framework developed in social psychology, we design experimentally sound prompts to test three LLMs&#39; performance on three toxic and bias benchmarks. The findings demonstrate the sensitivity of all three models to HEXACO personality traits and, more importantly, a consistent variation in the biases, negative sentiment and toxicity of their output. In particular, adjusting the levels of several personality traits can effectively reduce bias and toxicity in model performance, similar to humans&#39; correlations between personality traits and toxic behaviors. The findings highlight the additional need to examine content safety besides the efficiency of training or fine-tuning methods for LLM personification. They also suggest a potential for the adjustment of personalities to be a simple and low-cost method to conduct controlled text generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12566v2-abstract-full').style.display = 'none'; document.getElementById('2502.12566v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12450">arXiv:2502.12450</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12450">pdf</a>, <a href="https://arxiv.org/format/2502.12450">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Investigating and Extending Homans&#39; Social Exchange Theory with Large Language Model based Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zheqing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12450v1-abstract-short" style="display: inline;"> Homans&#39; Social Exchange Theory (SET) is widely recognized as a basic framework for understanding the formation and emergence of human civilizations and social structures. In social science, this theory is typically studied based on simple simulation experiments or real-world human studies, both of which either lack realism or are too expensive to control. In artificial intelligence, recent advance&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12450v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12450v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12450v1-abstract-full" style="display: none;"> Homans&#39; Social Exchange Theory (SET) is widely recognized as a basic framework for understanding the formation and emergence of human civilizations and social structures. In social science, this theory is typically studied based on simple simulation experiments or real-world human studies, both of which either lack realism or are too expensive to control. In artificial intelligence, recent advances in large language models (LLMs) have shown promising capabilities in simulating human behaviors. Inspired by these insights, we adopt an interdisciplinary research perspective and propose using LLM-based agents to study Homans&#39; SET. Specifically, we construct a virtual society composed of three LLM agents and have them engage in a social exchange game to observe their behaviors. Through extensive experiments, we found that Homans&#39; SET is well validated in our agent society, demonstrating the consistency between the agent and human behaviors. Building on this foundation, we intentionally alter the settings of the agent society to extend the traditional Homans&#39; SET, making it more comprehensive and detailed. To the best of our knowledge, this paper marks the first step in studying Homans&#39; SET with LLM-based agents. More importantly, it introduces a novel and feasible research paradigm that bridges the fields of social science and computer science through LLM-based agents. Code is available at https://github.com/Paitesanshi/SET. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12450v1-abstract-full').style.display = 'none'; document.getElementById('2502.12450v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12442">arXiv:2502.12442</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12442">pdf</a>, <a href="https://arxiv.org/format/2502.12442">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HopRAG: Multi-Hop Reasoning for Logic-Aware Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhengren Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+F">Feiyu Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qinhan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wentao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12442v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) systems often struggle with imperfect retrieval, as traditional retrievers focus on lexical or semantic similarity rather than logical relevance. To address this, we propose HopRAG, a novel RAG framework that augments retrieval with logical reasoning through graph-structured knowledge exploration. During indexing, HopRAG constructs a passage graph, with text ch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12442v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12442v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12442v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) systems often struggle with imperfect retrieval, as traditional retrievers focus on lexical or semantic similarity rather than logical relevance. To address this, we propose HopRAG, a novel RAG framework that augments retrieval with logical reasoning through graph-structured knowledge exploration. During indexing, HopRAG constructs a passage graph, with text chunks as vertices and logical connections established via LLM-generated pseudo-queries as edges. During retrieval, it employs a retrieve-reason-prune mechanism: starting with lexically or semantically similar passages, the system explores multi-hop neighbors guided by pseudo-queries and LLM reasoning to identify truly relevant ones. Extensive experiments demonstrate HopRAG&#39;s superiority, achieving 76.78\% higher answer accuracy and 65.07\% improved retrieval F1 score compared to conventional methods. The repository is available at https://github.com/LIU-Hao-2002/HopRAG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12442v1-abstract-full').style.display = 'none'; document.getElementById('2502.12442v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12411">arXiv:2502.12411</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12411">pdf</a>, <a href="https://arxiv.org/format/2502.12411">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gradient Co-occurrence Analysis for Detecting Unsafe Prompts in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jingyuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+B">Bowen Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rongjun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Ziyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Z">Zhiyong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+W">Wei Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12411v1-abstract-short" style="display: inline;"> Unsafe prompts pose significant safety risks to large language models (LLMs). Existing methods for detecting unsafe prompts rely on data-driven fine-tuning to train guardrail models, necessitating significant data and computational resources. In contrast, recent few-shot gradient-based methods emerge, requiring only few safe and unsafe reference prompts. A gradient-based approach identifies unsafe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12411v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12411v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12411v1-abstract-full" style="display: none;"> Unsafe prompts pose significant safety risks to large language models (LLMs). Existing methods for detecting unsafe prompts rely on data-driven fine-tuning to train guardrail models, necessitating significant data and computational resources. In contrast, recent few-shot gradient-based methods emerge, requiring only few safe and unsafe reference prompts. A gradient-based approach identifies unsafe prompts by analyzing consistent patterns of the gradients of safety-critical parameters in LLMs. Although effective, its restriction to directional similarity (cosine similarity) introduces ``directional bias&#39;&#39;, limiting its capability to identify unsafe prompts. To overcome this limitation, we introduce GradCoo, a novel gradient co-occurrence analysis method that expands the scope of safety-critical parameter identification to include unsigned gradient similarity, thereby reducing the impact of ``directional bias&#39;&#39; and enhancing the accuracy of unsafe prompt detection. Comprehensive experiments on the widely-used benchmark datasets ToxicChat and XStest demonstrate that our proposed method can achieve state-of-the-art (SOTA) performance compared to existing methods. Moreover, we confirm the generalizability of GradCoo in detecting unsafe prompts across a range of LLM base models with various sizes and origins. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12411v1-abstract-full').style.display = 'none'; document.getElementById('2502.12411v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11946">arXiv:2502.11946</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11946">pdf</a>, <a href="https://arxiv.org/format/2502.11946">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+A">Ailin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Boyong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bruce Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Chao Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+C">Chen Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chengli Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+F">Fei Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+F">Feiyu Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingbei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mingrui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+R">Ruihang Miao</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+W">Wang You</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xuerui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yechang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zixin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jianjian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Brian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chengting Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hanpeng Hu</a> , et al. (120 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11946v2-abstract-short" style="display: inline;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contribu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11946v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11946v2-abstract-full" style="display: none;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'none'; document.getElementById('2502.11946v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11767">arXiv:2502.11767</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11767">pdf</a>, <a href="https://arxiv.org/format/2502.11767">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Selection to Generation: A Survey of LLM-based Active Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Y">Yu Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Mukherjee%2C+S">Subhojyoti Mukherjee</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Z">Zhouhang Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xintong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Aponte%2C+R">Ryan Aponte</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+H">Hanjia Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Barrow%2C+J">Joe Barrow</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongjie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&amp;query=Kveton%2C+B">Branislav Kveton</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmed%2C+N+K">Nesreen K. Ahmed</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Deilamsalehy%2C+H">Hanieh Deilamsalehy</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhengmian Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yue Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lipka%2C+N">Nedim Lipka</a>, <a href="/search/cs?searchtype=author&amp;query=Yoon%2C+S">Seunghyun Yoon</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+T+K">Ting-Hao Kenneth Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zichao Wang</a> , et al. (9 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11767v1-abstract-short" style="display: inline;"> Active Learning (AL) has been a powerful paradigm for improving model efficiency and performance by selecting the most informative data points for labeling and training. In recent active learning frameworks, Large Language Models (LLMs) have been employed not only for selection but also for generating entirely new data instances and providing more cost-effective annotations. Motivated by the incre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11767v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11767v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11767v1-abstract-full" style="display: none;"> Active Learning (AL) has been a powerful paradigm for improving model efficiency and performance by selecting the most informative data points for labeling and training. In recent active learning frameworks, Large Language Models (LLMs) have been employed not only for selection but also for generating entirely new data instances and providing more cost-effective annotations. Motivated by the increasing importance of high-quality data and efficient model training in the era of LLMs, we present a comprehensive survey on LLM-based Active Learning. We introduce an intuitive taxonomy that categorizes these techniques and discuss the transformative roles LLMs can play in the active learning loop. We further examine the impact of AL on LLM learning paradigms and its applications across various domains. Finally, we identify open challenges and propose future research directions. This survey aims to serve as an up-to-date resource for researchers and practitioners seeking to gain an intuitive understanding of LLM-based AL techniques and deploy them to new applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11767v1-abstract-full').style.display = 'none'; document.getElementById('2502.11767v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11721">arXiv:2502.11721</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11721">pdf</a>, <a href="https://arxiv.org/format/2502.11721">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Recommendation Explanations through User-Centric Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingsen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Z">Zihang Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+X">Xueyang Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11721v1-abstract-short" style="display: inline;"> Generating natural language explanations for recommendations has become increasingly important in recommender systems. Traditional approaches typically treat user reviews as ground truth for explanations and focus on improving review prediction accuracy by designing various model architectures. However, due to limitations in data scale and model capability, these explanations often fail to meet ke&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11721v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11721v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11721v1-abstract-full" style="display: none;"> Generating natural language explanations for recommendations has become increasingly important in recommender systems. Traditional approaches typically treat user reviews as ground truth for explanations and focus on improving review prediction accuracy by designing various model architectures. However, due to limitations in data scale and model capability, these explanations often fail to meet key user-centric aspects such as factuality, personalization, and sentiment coherence, significantly reducing their overall helpfulness to users. In this paper, we propose a novel paradigm that refines initial explanations generated by existing explainable recommender models during the inference stage to enhance their quality in multiple aspects. Specifically, we introduce a multi-agent collaborative refinement framework based on large language models. To ensure alignment between the refinement process and user demands, we employ a plan-then-refine pattern to perform targeted modifications. To enable continuous improvements, we design a hierarchical reflection mechanism that provides feedback on the refinement process from both strategic and content perspectives. Extensive experiments on three datasets demonstrate the effectiveness of our framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11721v1-abstract-full').style.display = 'none'; document.getElementById('2502.11721v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11664">arXiv:2502.11664</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11664">pdf</a>, <a href="https://arxiv.org/format/2502.11664">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VRoPE: Rotary Position Embedding for Video Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zikang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Longteng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yepeng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Junxian Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+K">Kai Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11664v1-abstract-short" style="display: inline;"> Rotary Position Embedding (RoPE) has shown strong performance in text-based Large Language Models (LLMs), but extending it to video remains a challenge due to the intricate spatiotemporal structure of video frames. Existing adaptations, such as RoPE-3D, attempt to encode spatial and temporal dimensions separately but suffer from two major limitations: positional bias in attention distribution and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11664v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11664v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11664v1-abstract-full" style="display: none;"> Rotary Position Embedding (RoPE) has shown strong performance in text-based Large Language Models (LLMs), but extending it to video remains a challenge due to the intricate spatiotemporal structure of video frames. Existing adaptations, such as RoPE-3D, attempt to encode spatial and temporal dimensions separately but suffer from two major limitations: positional bias in attention distribution and disruptions in video-text transitions. To overcome these issues, we propose Video Rotary Position Embedding (VRoPE), a novel positional encoding method tailored for Video-LLMs. Our approach restructures positional indices to preserve spatial coherence and ensure a smooth transition between video and text tokens. Additionally, we introduce a more balanced encoding strategy that mitigates attention biases, ensuring a more uniform distribution of spatial focus. Extensive experiments on Vicuna and Qwen2 across different model scales demonstrate that VRoPE consistently outperforms previous RoPE variants, achieving significant improvements in video understanding, temporal reasoning, and retrieval tasks. Code will be available at https://github.com/johncaged/VRoPE <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11664v1-abstract-full').style.display = 'none'; document.getElementById('2502.11664v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11609">arXiv:2502.11609</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11609">pdf</a>, <a href="https://arxiv.org/format/2502.11609">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Task Relationships for Continual Learning Using Transferability-Aware Task Embeddings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yanru Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiangyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianning Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+E">Enming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hanbing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11609v1-abstract-short" style="display: inline;"> Continual learning (CL) has been an essential topic in the contemporary application of deep neural networks, where catastrophic forgetting (CF) can impede a model&#39;s ability to acquire knowledge progressively. Existing CL strategies primarily address CF by regularizing model updates or separating task-specific and shared components. However, these methods focus on task model elements while overlook&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11609v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11609v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11609v1-abstract-full" style="display: none;"> Continual learning (CL) has been an essential topic in the contemporary application of deep neural networks, where catastrophic forgetting (CF) can impede a model&#39;s ability to acquire knowledge progressively. Existing CL strategies primarily address CF by regularizing model updates or separating task-specific and shared components. However, these methods focus on task model elements while overlooking the potential of leveraging inter-task relationships for learning enhancement. To address this, we propose a transferability-aware task embedding named H-embedding and train a hypernet under its guidance to learn task-conditioned model weights for CL tasks. Particularly, H-embedding is introduced based on an information theoretical transferability measure and is designed to be online and easy to compute. The framework is also characterized by notable practicality, which only requires storing a low-dimensional task embedding for each task, and can be efficiently trained in an end-to-end way. Extensive evaluations and experimental analyses on datasets including Permuted MNIST, Cifar10/100, and ImageNet-R demonstrate that our framework performs prominently compared to various baseline methods, displaying great potential in exploiting intrinsic task relationships. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11609v1-abstract-full').style.display = 'none'; document.getElementById('2502.11609v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11586">arXiv:2502.11586</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11586">pdf</a>, <a href="https://arxiv.org/format/2502.11586">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Syllables to Scenes: Literary-Guided Free-Viewpoint 3D Scene Synthesis from Japanese Haiku </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chunan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+C">Chaotao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zang%2C+Y">Ying Zang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lanyun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinhao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zejian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Renjun Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianrun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11586v1-abstract-short" style="display: inline;"> In the era of the metaverse, where immersive technologies redefine human experiences, translating abstract literary concepts into navigable 3D environments presents a fundamental challenge in preserving semantic and emotional fidelity. This research introduces HaikuVerse, a novel framework for transforming poetic abstraction into spatial representation, with Japanese Haiku serving as an ideal test&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11586v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11586v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11586v1-abstract-full" style="display: none;"> In the era of the metaverse, where immersive technologies redefine human experiences, translating abstract literary concepts into navigable 3D environments presents a fundamental challenge in preserving semantic and emotional fidelity. This research introduces HaikuVerse, a novel framework for transforming poetic abstraction into spatial representation, with Japanese Haiku serving as an ideal test case due to its sophisticated encapsulation of profound emotions and imagery within minimal text. While existing text-to-3D methods struggle with nuanced interpretations, we present a literary-guided approach that synergizes traditional poetry analysis with advanced generative technologies. Our framework centers on two key innovations: (1) Hierarchical Literary-Criticism Theory Grounded Parsing (H-LCTGP), which captures both explicit imagery and implicit emotional resonance through structured semantic decomposition, and (2) Progressive Dimensional Synthesis (PDS), a multi-stage pipeline that systematically transforms poetic elements into coherent 3D scenes through sequential diffusion processes, geometric optimization, and real-time enhancement. Extensive experiments demonstrate that HaikuVerse significantly outperforms conventional text-to-3D approaches in both literary fidelity and visual quality, establishing a new paradigm for preserving cultural heritage in immersive digital spaces. Project website at: https://syllables-to-scenes.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11586v1-abstract-full').style.display = 'none'; document.getElementById('2502.11586v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 11 figures, submitted to IJCAI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11453">arXiv:2502.11453</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11453">pdf</a>, <a href="https://arxiv.org/format/2502.11453">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Connector-S: A Survey of Connectors in Multi-modal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yiming Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Miao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Ji Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11453v1-abstract-short" style="display: inline;"> With the rapid advancements in multi-modal large language models (MLLMs), connectors play a pivotal role in bridging diverse modalities and enhancing model performance. However, the design and evolution of connectors have not been comprehensively analyzed, leaving gaps in understanding how these components function and hindering the development of more powerful connectors. In this survey, we syste&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11453v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11453v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11453v1-abstract-full" style="display: none;"> With the rapid advancements in multi-modal large language models (MLLMs), connectors play a pivotal role in bridging diverse modalities and enhancing model performance. However, the design and evolution of connectors have not been comprehensively analyzed, leaving gaps in understanding how these components function and hindering the development of more powerful connectors. In this survey, we systematically review the current progress of connectors in MLLMs and present a structured taxonomy that categorizes connectors into atomic operations (mapping, compression, mixture of experts) and holistic designs (multi-layer, multi-encoder, multi-modal scenarios), highlighting their technical contributions and advancements. Furthermore, we discuss several promising research frontiers and challenges, including high-resolution input, dynamic compression, guide information selection, combination strategy, and interpretability. This survey is intended to serve as a foundational reference and a clear roadmap for researchers, providing valuable insights into the design and optimization of next-generation connectors to enhance the performance and adaptability of MLLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11453v1-abstract-full').style.display = 'none'; document.getElementById('2502.11453v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11435">arXiv:2502.11435</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11435">pdf</a>, <a href="https://arxiv.org/format/2502.11435">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SMART: Self-Aware Agent for Tool Overuse Mitigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+C">Cheng Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Acikgoz%2C+E+C">Emre Can Acikgoz</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongru Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiusi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Sil%2C+A">Avirup Sil</a>, <a href="/search/cs?searchtype=author&amp;query=Hakkani-T%C3%BCr%2C+D">Dilek Hakkani-T眉r</a>, <a href="/search/cs?searchtype=author&amp;query=Tur%2C+G">Gokhan Tur</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+H">Heng Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11435v1-abstract-short" style="display: inline;"> Current Large Language Model (LLM) agents demonstrate strong reasoning and tool use capabilities, but often lack self-awareness, failing to balance these approaches effectively. This imbalance leads to Tool Overuse, where models unnecessarily rely on external tools for tasks solvable with parametric knowledge, increasing computational overhead. Inspired by human metacognition, we introduce SMART (&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11435v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11435v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11435v1-abstract-full" style="display: none;"> Current Large Language Model (LLM) agents demonstrate strong reasoning and tool use capabilities, but often lack self-awareness, failing to balance these approaches effectively. This imbalance leads to Tool Overuse, where models unnecessarily rely on external tools for tasks solvable with parametric knowledge, increasing computational overhead. Inspired by human metacognition, we introduce SMART (Strategic Model-Aware Reasoning with Tools), a paradigm that enhances an agent&#39;s self-awareness to optimize task handling and reduce tool overuse. To support this paradigm, we introduce SMART-ER, a dataset spanning three domains, where reasoning alternates between parametric knowledge and tool-dependent steps, with each step enriched by rationales explaining when tools are necessary. Through supervised training, we develop SMARTAgent, a family of models that dynamically balance parametric knowledge and tool use. Evaluations show that SMARTAgent reduces tool use by 24% while improving performance by over 37%, enabling 7B-scale models to match its 70B counterpart and GPT-4o. Additionally, SMARTAgent generalizes to out-of-distribution test data like GSM8K and MINTQA, maintaining accuracy with just one-fifth the tool calls. These highlight the potential of strategic tool use to enhance reasoning, mitigate overuse, and bridge the gap between model size and performance, advancing intelligent and resource-efficient agent designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11435v1-abstract-full').style.display = 'none'; document.getElementById('2502.11435v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 tables, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11404">arXiv:2502.11404</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11404">pdf</a>, <a href="https://arxiv.org/format/2502.11404">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ToolCoder: A Systematic Code-Empowered Tool Learning Framework for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ding%2C+H">Hanxing Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+S">Shuchang Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+L">Liang Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zihao Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jinyang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+B">Bolin Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Huawei Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xueqi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11404v1-abstract-short" style="display: inline;"> Tool learning has emerged as a crucial capability for large language models (LLMs) to solve complex real-world tasks through interaction with external tools. Existing approaches face significant challenges, including reliance on hand-crafted prompts, difficulty in multi-step planning, and lack of precise error diagnosis and reflection mechanisms. We propose ToolCoder, a novel framework that reform&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11404v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11404v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11404v1-abstract-full" style="display: none;"> Tool learning has emerged as a crucial capability for large language models (LLMs) to solve complex real-world tasks through interaction with external tools. Existing approaches face significant challenges, including reliance on hand-crafted prompts, difficulty in multi-step planning, and lack of precise error diagnosis and reflection mechanisms. We propose ToolCoder, a novel framework that reformulates tool learning as a code generation task. Inspired by software engineering principles, ToolCoder transforms natural language queries into structured Python function scaffold and systematically breaks down tasks with descriptive comments, enabling LLMs to leverage coding paradigms for complex reasoning and planning. It then generates and executes function implementations to obtain final responses. Additionally, ToolCoder stores successfully executed functions in a repository to promote code reuse, while leveraging error traceback mechanisms for systematic debugging, optimizing both execution efficiency and robustness. Experiments demonstrate that ToolCoder achieves superior performance in task completion accuracy and execution reliability compared to existing approaches, establishing the effectiveness of code-centric approaches in tool learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11404v1-abstract-full').style.display = 'none'; document.getElementById('2502.11404v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11307">arXiv:2502.11307</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11307">pdf</a>, <a href="https://arxiv.org/format/2502.11307">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Point-Language Models with Dual-Prompts for 3D Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiaxiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haote Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaolu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haodi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yue Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+X">Xinghao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+X">Xiaotong Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11307v1-abstract-short" style="display: inline;"> Anomaly detection (AD) in 3D point clouds is crucial in a wide range of industrial applications, especially in various forms of precision manufacturing. Considering the industrial demand for reliable 3D AD, several methods have been developed. However, most of these approaches typically require training separate models for each category, which is memory-intensive and lacks flexibility. In this pap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11307v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11307v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11307v1-abstract-full" style="display: none;"> Anomaly detection (AD) in 3D point clouds is crucial in a wide range of industrial applications, especially in various forms of precision manufacturing. Considering the industrial demand for reliable 3D AD, several methods have been developed. However, most of these approaches typically require training separate models for each category, which is memory-intensive and lacks flexibility. In this paper, we propose a novel Point-Language model with dual-prompts for 3D ANomaly dEtection (PLANE). The approach leverages multi-modal prompts to extend the strong generalization capabilities of pre-trained Point-Language Models (PLMs) to the domain of 3D point cloud AD, achieving impressive detection performance across multiple categories using a single model. Specifically, we propose a dual-prompt learning method, incorporating both text and point cloud prompts. The method utilizes a dynamic prompt creator module (DPCM) to produce sample-specific dynamic prompts, which are then integrated with class-specific static prompts for each modality, effectively driving the PLMs. Additionally, based on the characteristics of point cloud data, we propose a pseudo 3D anomaly generation method (Ano3D) to improve the model&#39;s detection capabilities in an unsupervised setting. Experimental results demonstrate that the proposed method, which is under the multi-class-one-model paradigm, achieves a +8.7%/+17% gain on anomaly detection and localization performance as compared to the state-of-the-art one-class-one-model methods for the Anomaly-ShapeNet dataset, and obtains +4.3%/+4.1% gain for the Real3D-AD dataset. Code will be available upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11307v1-abstract-full').style.display = 'none'; document.getElementById('2502.11307v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11147">arXiv:2502.11147</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11147">pdf</a>, <a href="https://arxiv.org/format/2502.11147">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Long-Decoding Inference with Reasoning-Aware Attention Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Junhao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wenrui Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenwen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+T">Tiancheng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhixia Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xusheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Y">Yizhou Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11147v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated strong capabilities across various domains, with recent advancements in challenging reasoning tasks such as mathematics and programming. However, solving reasoning tasks often requires long decoding chains (of thoughts), which incur $O(N)$ time and memory consumption, where $N$ is the chain length. To mitigate $O(N)$ time and memory consumption, exist&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11147v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11147v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11147v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated strong capabilities across various domains, with recent advancements in challenging reasoning tasks such as mathematics and programming. However, solving reasoning tasks often requires long decoding chains (of thoughts), which incur $O(N)$ time and memory consumption, where $N$ is the chain length. To mitigate $O(N)$ time and memory consumption, existing sparsity-based algorithms propose retaining only the most critical token&#39;s intermediate data (i.e., key-value cache) and discarding the rest. However, these existing algorithms struggle with the ``impossible trinity&#39;&#39; of accuracy, time, and memory. For example, the state-of-the-art algorithm, Quest, achieves high accuracy with $O(L)$ time but $O(N)$ memory ($L$ is the cache budget, $L \ll N$). To address this issue, in this paper, we identify a new attention pattern during the decode stage of reasoning tasks, where milestone tokens (analogous to lemmas in mathematical proofs) emerge, are utilized, and then become unimportant afterward. Based on this pattern, we propose a new algorithm named RaaS that identifies and retains milestone tokens only until they are no longer needed, achieving high accuracy with $O(L)$ time and $O(L)$ memory complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11147v1-abstract-full').style.display = 'none'; document.getElementById('2502.11147v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11075">arXiv:2502.11075</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11075">pdf</a>, <a href="https://arxiv.org/format/2502.11075">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exposing Numeracy Gaps: A Benchmark to Evaluate Fundamental Numerical Abilities in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuejia Chen</a>, <a href="/search/cs?searchtype=author&amp;query=XU%2C+Z">Zhanchao XU</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Darian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+N">Nicole Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Teng%2C+F">Fei Teng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yiming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Luyu Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C+J">Chen Jason Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11075v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated impressive capabilities in natural language processing tasks, such as text generation and semantic understanding. However, their performance on numerical reasoning tasks, such as basic arithmetic, numerical retrieval, and magnitude comparison, remains surprisingly poor. This gap arises from their reliance on surface-level statistical patterns rather t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11075v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11075v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11075v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated impressive capabilities in natural language processing tasks, such as text generation and semantic understanding. However, their performance on numerical reasoning tasks, such as basic arithmetic, numerical retrieval, and magnitude comparison, remains surprisingly poor. This gap arises from their reliance on surface-level statistical patterns rather than understanding numbers as continuous magnitudes. Existing benchmarks primarily focus on either linguistic competence or structured mathematical problem-solving, neglecting fundamental numerical reasoning required in real-world scenarios. To bridge this gap, we propose NumericBench, a comprehensive benchmark to evaluate six fundamental numerical capabilities: number recognition, arithmetic operations, contextual retrieval, comparison, summary, and logical reasoning. NumericBench includes datasets ranging from synthetic number lists to the crawled real-world data, addressing challenges like long contexts, noise, and multi-step reasoning. Extensive experiments on state-of-the-art LLMs, including GPT-4 and DeepSeek, reveal persistent weaknesses in numerical reasoning, highlighting the urgent need to improve numerically-aware language modeling. The benchmark is released in: https://github.com/TreeAI-Lab/NumericBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11075v1-abstract-full').style.display = 'none'; document.getElementById('2502.11075v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10708">arXiv:2502.10708</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10708">pdf</a>, <a href="https://arxiv.org/format/2502.10708">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Injecting Domain-Specific Knowledge into Large Language Models: A Comprehensive Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zirui Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+B">Bin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuhan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+M">Miao Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingzhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10708v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable success in various tasks such as natural language understanding, text summarization, and machine translation. However, their general-purpose nature often limits their effectiveness in domain-specific applications that require specialized knowledge, such as healthcare, chemistry, or legal analysis. To address this, researchers have explored&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10708v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10708v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10708v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable success in various tasks such as natural language understanding, text summarization, and machine translation. However, their general-purpose nature often limits their effectiveness in domain-specific applications that require specialized knowledge, such as healthcare, chemistry, or legal analysis. To address this, researchers have explored diverse methods to enhance LLMs by integrating domain-specific knowledge. In this survey, we provide a comprehensive overview of these methods, which we categorize into four key approaches: dynamic knowledge injection, static knowledge embedding, modular adapters, and prompt optimization. Each approach offers unique mechanisms to equip LLMs with domain expertise, balancing trade-offs between flexibility, scalability, and efficiency. We discuss how these methods enable LLMs to tackle specialized tasks, compare their advantages and disadvantages, evaluate domain-specific LLMs against general LLMs, and highlight the challenges and opportunities in this emerging field. For those interested in delving deeper into this area, we also summarize the commonly used datasets and benchmarks. To keep researchers updated on the latest studies, we maintain an open-source at: https://github.com/abilliyb/Knowledge_Injection_Survey_Papers, dedicated to documenting research in the field of specialized LLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10708v1-abstract-full').style.display = 'none'; document.getElementById('2502.10708v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10596">arXiv:2502.10596</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10596">pdf</a>, <a href="https://arxiv.org/format/2502.10596">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Post-training an LLM for RAG? Train on Self-Generated Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Finlayson%2C+M">Matthew Finlayson</a>, <a href="/search/cs?searchtype=author&amp;query=Kulikov%2C+I">Ilia Kulikov</a>, <a href="/search/cs?searchtype=author&amp;query=Bikel%2C+D+M">Daneil M. Bikel</a>, <a href="/search/cs?searchtype=author&amp;query=Oguz%2C+B">Barlas Oguz</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xilun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pappu%2C+A">Aasish Pappu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10596v1-abstract-short" style="display: inline;"> Large language models (LLMs) often struggle with knowledge intensive NLP tasks, such as answering &#34;Who won the latest World Cup?&#34; because the knowledge they learn during training may be insufficient or outdated. Conditioning generation on retrieved documents -- a technique known as retrieval augmented generation (RAG) -- mitigates these shortcomings by allowing the model to leverage in-context inf&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10596v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10596v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10596v1-abstract-full" style="display: none;"> Large language models (LLMs) often struggle with knowledge intensive NLP tasks, such as answering &#34;Who won the latest World Cup?&#34; because the knowledge they learn during training may be insufficient or outdated. Conditioning generation on retrieved documents -- a technique known as retrieval augmented generation (RAG) -- mitigates these shortcomings by allowing the model to leverage in-context information. Practitioners can improve LLM RAG performance by fine-tuning on retrieval-augmented instructions, but must beware that this can cause undesirable model behaviors like hallucinations. We attribute this degradation to the fact that the training data is likely to be out-of-distribution for the model and may suffer from quality issues, such as misalignment between retrievals and target responses (since retrievals are frequently added post-hoc). We propose a recipe for training RAG-enabled LLMs using self-generated demonstrations, thereby avoiding training on out-of-distribution text and integrating retrievals into the LLM responses. We evaluate our method on knowledge intensive question answering (QA) tasks and show that our method teaches LLMs to properly handle in-context retrievals and abstain from questions it will likely get wrong. Compared to conventional RA-IT methods, our method prevents model degradation in non-RAG settings while exhibiting superior QA performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10596v1-abstract-full').style.display = 'none'; document.getElementById('2502.10596v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+K">Kaijun Tan</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v3-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v3-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v3-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V&#39;s performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v3-abstract-full').style.display = 'none'; document.getElementById('2502.10248v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10203">arXiv:2502.10203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10203">pdf</a>, <a href="https://arxiv.org/format/2502.10203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> AI-in-the-Loop Sensing and Communication Joint Design for Edge Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zhijie Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+X">Xiaowen Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Y">Yuanhao Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+K">Kaibin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+S">Shuguang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10203v1-abstract-short" style="display: inline;"> Recent breakthroughs in artificial intelligence (AI), wireless communications, and sensing technologies have accelerated the evolution of edge intelligence. However, conventional systems still grapple with issues such as low communication efficiency, redundant data acquisition, and poor model generalization. To overcome these challenges, we propose an innovative framework that enhances edge intell&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10203v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10203v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10203v1-abstract-full" style="display: none;"> Recent breakthroughs in artificial intelligence (AI), wireless communications, and sensing technologies have accelerated the evolution of edge intelligence. However, conventional systems still grapple with issues such as low communication efficiency, redundant data acquisition, and poor model generalization. To overcome these challenges, we propose an innovative framework that enhances edge intelligence through AI-in-the-loop joint sensing and communication (JSAC). This framework features an AI-driven closed-loop control architecture that jointly optimizes system resources, thereby delivering superior system-level performance. A key contribution of our work is establishing an explicit relationship between validation loss and the system&#39;s tunable parameters. This insight enables dynamic reduction of the generalization error through AI-driven closed-loop control. Specifically, for sensing control, we introduce an adaptive data collection strategy based on gradient importance sampling, allowing edge devices to autonomously decide when to terminate data acquisition and how to allocate sample weights based on real-time model feedback. For communication control, drawing inspiration from stochastic gradient Langevin dynamics (SGLD), our joint optimization of transmission power and batch size converts channel and data noise into gradient perturbations that help mitigate overfitting. Experimental evaluations demonstrate that our framework reduces communication energy consumption by up to 77 percent and sensing costs measured by the number of collected samples by up to 52 percent while significantly improving model generalization -- with up to 58 percent reductions of the final validation loss. It validates that the proposed scheme can harvest the mutual benefit of AI and JSAC systems by incorporating the model itself into the control loop of the system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10203v1-abstract-full').style.display = 'none'; document.getElementById('2502.10203v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09940">arXiv:2502.09940</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09940">pdf</a>, <a href="https://arxiv.org/format/2502.09940">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Preliminary Exploration with GPT-4o Voice Mode </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09940v1-abstract-short" style="display: inline;"> With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command clas&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09940v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09940v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09940v1-abstract-full" style="display: none;"> With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command classification, semantic and grammatical reasoning., multilingual speech recognition, and singing analysis. It also shows greater robustness against hallucinations than other large audio-language models (LALMs). However, it struggles with tasks such as audio duration prediction and instrument classification. Additionally, GPT-4o&#39;s safety mechanisms cause it to decline tasks like speaker identification, age classification, MOS prediction, and audio deepfake detection. Notably, the model exhibits a significantly different refusal rate when responding to speaker verification tasks on different datasets. This is likely due to variations in the accompanying instructions or the quality of the input audio, suggesting the sensitivity of its built-in safeguards. Finally, we acknowledge that model performance varies with evaluation protocols. This report only serves as a preliminary exploration of the current state of LALMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09940v1-abstract-full').style.display = 'none'; document.getElementById('2502.09940v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09621">arXiv:2502.09621</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09621">pdf</a>, <a href="https://arxiv.org/format/2502.09621">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MME-CoT: Benchmarking Chain-of-Thought in Large Multimodal Models for Reasoning Quality, Robustness, and Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongzhi Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Ziyu Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yanwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Y">Yu Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinyan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liuhui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+J">Jianhan Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Claire Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shen Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+C">Chaoyou Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Peng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongsheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09621v1-abstract-short" style="display: inline;"> Answering questions with Chain-of-Thought (CoT) has significantly enhanced the reasoning capabilities of Large Language Models (LLMs), yet its impact on Large Multimodal Models (LMMs) still lacks a systematic assessment and in-depth investigation. In this paper, we introduce MME-CoT, a specialized benchmark evaluating the CoT reasoning performance of LMMs, spanning six domains: math, science, OCR,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09621v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09621v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09621v1-abstract-full" style="display: none;"> Answering questions with Chain-of-Thought (CoT) has significantly enhanced the reasoning capabilities of Large Language Models (LLMs), yet its impact on Large Multimodal Models (LMMs) still lacks a systematic assessment and in-depth investigation. In this paper, we introduce MME-CoT, a specialized benchmark evaluating the CoT reasoning performance of LMMs, spanning six domains: math, science, OCR, logic, space-time, and general scenes. As the first comprehensive study in this area, we propose a thorough evaluation suite incorporating three novel metrics that assess the reasoning quality, robustness, and efficiency at a fine-grained level. Leveraging curated high-quality data and a unique evaluation strategy, we conduct an in-depth analysis of state-of-the-art LMMs, uncovering several key insights: 1) Models with reflection mechanism demonstrate a superior CoT quality, with Kimi k1.5 outperforming GPT-4o and demonstrating the highest quality results; 2) CoT prompting often degrades LMM performance on perception-heavy tasks, suggesting a potentially harmful overthinking behavior; and 3) Although the CoT quality is high, LMMs with reflection exhibit significant inefficiency in both normal response and self-correction phases. We hope MME-CoT serves as a foundation for advancing multimodal reasoning in LMMs. Project Page: https://mmecot.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09621v1-abstract-full').style.display = 'none'; document.getElementById('2502.09621v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://mmecot.github.io/</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10