Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,610 results for author: <span class="mathjax">Zhang, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21758">arXiv:2503.21758</a> <span> [<a href="https://arxiv.org/pdf/2503.21758">pdf</a>, <a href="https://arxiv.org/format/2503.21758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Lumina-Image 2.0: A Unified and Efficient Image Generative Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Q">Qi Qin</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+L">Le Zhuo</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+Y">Yi Xin</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruoyi Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yiting Lu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiakang Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyue Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongyang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiangyang Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Manyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Beddow%2C+W">Will Beddow</a>, <a href="/search/cs?searchtype=author&query=Millon%2C+E">Erwann Millon</a>, <a href="/search/cs?searchtype=author&query=Perez%2C+V">Victor Perez</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenhai Wang</a>, <a 
href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaohong Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongsheng Li</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chang Xu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Peng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21758v1-abstract-short" style="display: inline;"> We introduce Lumina-Image 2.0, an advanced text-to-image generation framework that achieves significant progress compared to previous work, Lumina-Next. Lumina-Image 2.0 is built upon two key principles: (1) Unification - it adopts a unified architecture (Unified Next-DiT) that treats text and image tokens as a joint sequence, enabling natural cross-modal interactions and allowing seamless task ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21758v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21758v1-abstract-full" style="display: none;"> We introduce Lumina-Image 2.0, an advanced text-to-image generation framework that achieves significant progress compared to previous work, Lumina-Next. Lumina-Image 2.0 is built upon two key principles: (1) Unification - it adopts a unified architecture (Unified Next-DiT) that treats text and image tokens as a joint sequence, enabling natural cross-modal interactions and allowing seamless task expansion. Besides, since high-quality captioners can provide semantically well-aligned text-image training pairs, we introduce a unified captioning system, Unified Captioner (UniCap), specifically designed for T2I generation tasks. UniCap excels at generating comprehensive and accurate captions, accelerating convergence and enhancing prompt adherence. (2) Efficiency - to improve the efficiency of our proposed model, we develop multi-stage progressive training strategies and introduce inference acceleration techniques without compromising image quality. Extensive evaluations on academic benchmarks and public text-to-image arenas show that Lumina-Image 2.0 delivers strong performances even with only 2.6B parameters, highlighting its scalability and design efficiency. We have released our training details, code, and models at https://github.com/Alpha-VLLM/Lumina-Image-2.0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21758v1-abstract-full').style.display = 'none'; document.getElementById('2503.21758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech Report, 21 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21749">arXiv:2503.21749</a> <span> [<a href="https://arxiv.org/pdf/2503.21749">pdf</a>, <a href="https://arxiv.org/format/2503.21749">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LeX-Art: Rethinking Text Generation via Scalable High-Quality Data Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shitian Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qilong Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyue Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Q">Qi Qin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongyang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaipeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongsheng Li</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Peng Gao</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21749v1-abstract-short" style="display: inline;"> We introduce LeX-Art, a comprehensive suite for high-quality text-image synthesis that systematically bridges the gap between prompt expressiveness and text rendering fidelity. Our approach follows a data-centric paradigm, constructing a high-quality data synthesis pipeline based on Deepseek-R1 to curate LeX-10K, a dataset of 10K high-resolution, aesthetically refined 1024$\times$1024 images. Beyo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21749v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21749v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21749v1-abstract-full" style="display: none;"> We introduce LeX-Art, a comprehensive suite for high-quality text-image synthesis that systematically bridges the gap between prompt expressiveness and text rendering fidelity. Our approach follows a data-centric paradigm, constructing a high-quality data synthesis pipeline based on Deepseek-R1 to curate LeX-10K, a dataset of 10K high-resolution, aesthetically refined 1024$\times$1024 images. Beyond dataset construction, we develop LeX-Enhancer, a robust prompt enrichment model, and train two text-to-image models, LeX-FLUX and LeX-Lumina, achieving state-of-the-art text rendering performance. 
To systematically evaluate visual text generation, we introduce LeX-Bench, a benchmark that assesses fidelity, aesthetics, and alignment, complemented by Pairwise Normalized Edit Distance (PNED), a novel metric for robust text accuracy evaluation. Experiments demonstrate significant improvements, with LeX-Lumina achieving a 79.81% PNED gain on CreateBench, and LeX-FLUX outperforming baselines in color (+3.18%), positional (+4.45%), and font accuracy (+3.81%). Our codes, models, datasets, and demo are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21749v1-abstract-full').style.display = 'none'; document.getElementById('2503.21749v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://zhaoshitian.github.io/lexart/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21144">arXiv:2503.21144</a> <span> [<a href="https://arxiv.org/pdf/2503.21144">pdf</a>, <a href="https://arxiv.org/format/2503.21144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ChatAnyone: Stylized Real-time Portrait Video Generation with Hierarchical Motion Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+J">Jinwei Qi</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+C">Chaonan Ji</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Sheng Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bang Zhang</a>, <a href="/search/cs?searchtype=author&query=Bo%2C+L">Liefeng Bo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21144v1-abstract-short" style="display: inline;"> Real-time interactive video-chat portraits have been increasingly recognized as the future trend, particularly due to the remarkable progress made in text and voice chat technologies. However, existing methods primarily focus on real-time generation of head movements, but struggle to produce synchronized body motions that match these head actions. Additionally, achieving fine-grained control over… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21144v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21144v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21144v1-abstract-full" style="display: none;"> Real-time interactive video-chat portraits have been increasingly recognized as the future trend, particularly due to the remarkable progress made in text and voice chat technologies. 
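
The LeX-Art entry above reports results in terms of Pairwise Normalized Edit Distance (PNED) without defining it. As a rough point of reference only, the sketch below computes a plain length-normalized Levenshtein distance between a recognized text string and its ground truth; the pairwise matching and normalization scheme of the actual PNED metric is not described here, so the function name and formula are a generic illustration rather than the paper's definition.

```python
def normalized_edit_distance(pred: str, target: str) -> float:
    """Length-normalized Levenshtein distance in [0, 1].

    Generic illustration only: the paper's PNED metric layers a pairwise
    matching/normalization scheme on top of something like this.
    """
    m, n = len(pred), len(target)
    if m == 0 and n == 0:
        return 0.0
    prev = list(range(n + 1))              # DP row for the empty prefix of pred
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == target[j - 1] else 1
            curr[j] = min(prev[j] + 1,          # deletion
                          curr[j - 1] + 1,      # insertion
                          prev[j - 1] + cost)   # substitution
        prev = curr
    return prev[n] / max(m, n)


if __name__ == "__main__":
    print(normalized_edit_distance("LeX-Art", "LexArt"))  # ~0.29
```

Lower values mean closer agreement; the "PNED gain" reported above would be computed from the paper's own pairwise variant, not this plain distance.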

3. arXiv:2503.21144 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
ChatAnyone: Stylized Real-time Portrait Video Generation with Hierarchical Motion Diffusion Model
Authors: Jinwei Qi, Chaonan Ji, Sheng Xu, Peng Zhang, Bang Zhang, Liefeng Bo
Abstract: Real-time interactive video-chat portraits have been increasingly recognized as the future trend, particularly due to the remarkable progress made in text and voice chat technologies. However, existing methods primarily focus on real-time generation of head movements, but struggle to produce synchronized body motions that match these head actions. Additionally, achieving fine-grained control over the speaking style and nuances of facial expressions remains a challenge. To address these limitations, we introduce a novel framework for stylized real-time portrait video generation, enabling expressive and flexible video chat that extends from talking head to upper-body interaction. Our approach consists of the following two stages. The first stage involves efficient hierarchical motion diffusion models that take both explicit and implicit motion representations into account based on audio inputs, which can generate a diverse range of facial expressions with stylistic control and synchronization between head and body movements. The second stage aims to generate portrait video featuring upper-body movements, including hand gestures. We inject explicit hand control signals into the generator to produce more detailed hand movements, and further perform face refinement to enhance the overall realism and expressiveness of the portrait video. Additionally, our approach supports efficient and continuous generation of upper-body portrait video at up to 512×768 resolution and 30 fps on a 4090 GPU, supporting interactive video chat in real time. Experimental results demonstrate the capability of our approach to produce portrait videos with rich expressiveness and natural upper-body movements.
Submitted 27 March, 2025; originally announced March 2025.
Comments: Project Page: https://humanaigc.github.io/chat-anyone/

4. arXiv:2503.20215 [pdf, other] cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Qwen2.5-Omni Technical Report
Authors: Jin Xu, Zhifang Guo, Jinzheng He, Hangrui Hu, Ting He, Shuai Bai, Keqin Chen, Jialin Wang, Yang Fan, Kai Dang, Bin Zhang, Xiong Wang, Yunfei Chu, Junyang Lin
Abstract: In this report, we present Qwen2.5-Omni, an end-to-end multimodal model designed to perceive diverse modalities, including text, images, audio, and video, while simultaneously generating text and natural speech responses in a streaming manner. To enable the streaming of multimodal information inputs, both audio and visual encoders utilize a block-wise processing approach. To synchronize the timestamps of video inputs with audio, we organize the audio and video sequentially in an interleaved manner and propose a novel position embedding approach, named TMRoPE (Time-aligned Multimodal RoPE). To concurrently generate text and speech while avoiding interference between the two modalities, we propose the Thinker-Talker architecture. In this framework, Thinker functions as a large language model tasked with text generation, while Talker is a dual-track autoregressive model that directly utilizes the hidden representations from the Thinker to produce audio tokens as output. Both the Thinker and Talker models are designed to be trained and inferred in an end-to-end manner. For decoding audio tokens in a streaming manner, we introduce a sliding-window DiT that restricts the receptive field, aiming to reduce the initial package delay. Qwen2.5-Omni is comparable with the similarly sized Qwen2.5-VL and outperforms Qwen2-Audio. Furthermore, Qwen2.5-Omni achieves state-of-the-art performance on multimodal benchmarks like Omni-Bench. Notably, Qwen2.5-Omni's performance in end-to-end speech instruction following is comparable to its capabilities with text inputs, as evidenced by benchmarks such as MMLU and GSM8K. As for speech generation, Qwen2.5-Omni's streaming Talker outperforms most existing streaming and non-streaming alternatives in robustness and naturalness.
Submitted 26 March, 2025; originally announced March 2025.

5. arXiv:2503.17733 [pdf, other] cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
GS-LTS: 3D Gaussian Splatting-Based Adaptive Modeling for Long-Term Service Robots
Authors: Bin Fu, Jialin Li, Bin Zhang, Ruiping Wang, Xilin Chen
Abstract: 3D Gaussian Splatting (3DGS) has garnered significant attention in robotics for its explicit, high-fidelity dense scene representation, demonstrating strong potential for robotic applications. However, 3DGS-based methods in robotics primarily focus on static scenes, with limited attention to the dynamic scene changes essential for long-term service robots. These robots demand sustained task execution and efficient scene updates, challenges current approaches fail to meet. To address these limitations, we propose GS-LTS (Gaussian Splatting for Long-Term Service), a 3DGS-based system enabling indoor robots to manage diverse tasks in dynamic environments over time. GS-LTS detects scene changes (e.g., object addition or removal) via single-image change detection, employs a rule-based policy to autonomously collect multi-view observations, and efficiently updates the scene representation through Gaussian editing. Additionally, we propose a simulation-based benchmark that automatically generates scene change data as compact configuration scripts, providing a standardized, user-friendly evaluation benchmark. Experimental results demonstrate GS-LTS's advantages in reconstruction, navigation, and superior scene updates, faster and of higher quality than the image training baseline, advancing 3DGS for long-term robotic operations. Code and benchmark are available at: https://vipl-vsu.github.io/3DGS-LTS.
Submitted 22 March, 2025; originally announced March 2025.

6. arXiv:2503.17673 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
DCEvo: Discriminative Cross-Dimensional Evolutionary Learning for Infrared and Visible Image Fusion
Authors: Jinyuan Liu, Bowei Zhang, Qingyun Mei, Xingyuan Li, Yang Zou, Zhiying Jiang, Long Ma, Risheng Liu, Xin Fan
Abstract: Infrared and visible image fusion integrates information from distinct spectral bands to enhance image quality by leveraging the strengths and mitigating the limitations of each modality. Existing approaches typically treat image fusion and subsequent high-level tasks as separate processes, resulting in fused images that offer only marginal gains in task performance and fail to provide constructive feedback for optimizing the fusion process. To overcome these limitations, we propose a Discriminative Cross-Dimension Evolutionary Learning Framework, termed DCEvo, which simultaneously enhances visual quality and perception accuracy. Leveraging the robust search capabilities of Evolutionary Learning, our approach formulates the optimization of dual tasks as a multi-objective problem by employing an Evolutionary Algorithm (EA) to dynamically balance loss function parameters. Inspired by visual neuroscience, we integrate a Discriminative Enhancer (DE) within both the encoder and decoder, enabling the effective learning of complementary features from different modalities. Additionally, our Cross-Dimensional Embedding (CDE) block facilitates mutual enhancement between high-dimensional task features and low-dimensional fusion features, ensuring a cohesive and efficient feature integration process. Experimental results on three benchmarks demonstrate that our method significantly outperforms state-of-the-art approaches, achieving an average improvement of 9.32% in visual quality while also enhancing subsequent high-level tasks. The code is available at https://github.com/Beate-Suy-Zhang/DCEvo.
Submitted 22 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025
MSC Class: 68T45; ACM Class: I.4.3

7. arXiv:2503.16653 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
iFlame: Interleaving Full and Linear Attention for Efficient Mesh Generation
Authors: Hanxiao Wang, Biao Zhang, Weize Quan, Dong-Ming Yan, Peter Wonka
Abstract: This paper proposes iFlame, a novel transformer-based network architecture for mesh generation. While attention-based models have demonstrated remarkable performance in mesh generation, their quadratic computational complexity limits scalability, particularly for high-resolution 3D data. Conversely, linear attention mechanisms offer lower computational costs but often struggle to capture long-range dependencies, resulting in suboptimal outcomes. To address this trade-off, we propose an interleaving autoregressive mesh generation framework that combines the efficiency of linear attention with the expressive power of full attention mechanisms. To further enhance efficiency and leverage the inherent structure of mesh representations, we integrate this interleaving approach into an hourglass architecture, which significantly boosts efficiency. Our approach reduces training time while achieving performance comparable to pure attention-based models. To improve inference efficiency, we implemented a caching algorithm that almost doubles the speed and reduces the KV cache size by seven-eighths compared to the original Transformer. We evaluate our framework on ShapeNet and Objaverse, demonstrating its ability to generate high-quality 3D meshes efficiently. Our results indicate that the proposed interleaving framework effectively balances computational efficiency and generative performance, making it a practical solution for mesh generation. The training takes only 2 days with 4 GPUs on 39k data with a maximum of 4k faces on Objaverse.
Submitted 23 March, 2025; v1 submitted 20 March, 2025; originally announced March 2025.
Comments: Project website: https://wanghanxiao123.github.io/iFa/

8. arXiv:2503.16492 [pdf, other] cs.HC (Human-Computer Interaction); cs.RO (Robotics)
FAM-HRI: Foundation-Model Assisted Multi-Modal Human-Robot Interaction Combining Gaze and Speech
Authors: Yuzhi Lai, Shenghai Yuan, Boya Zhang, Benjamin Kiefer, Peizheng Li, Andreas Zell
Abstract: Effective Human-Robot Interaction (HRI) is crucial for enhancing accessibility and usability in real-world robotics applications. However, existing solutions often rely on gestures or language commands, making interaction inefficient and ambiguous, particularly for users with physical impairments. In this paper, we introduce FAM-HRI, an efficient multi-modal framework for human-robot interaction that integrates language and gaze inputs via foundation models. By leveraging lightweight Meta ARIA glasses, our system captures real-time multi-modal signals and utilizes large language models (LLMs) to fuse user intention with scene context, enabling intuitive and precise robot manipulation. Our method accurately determines the gaze fixation time interval, reducing noise caused by the dynamic nature of gaze. Experimental evaluations demonstrate that FAM-HRI achieves a high success rate in task execution while maintaining a low interaction time, providing a practical solution for individuals with limited physical mobility or motor impairments.
Submitted 10 March, 2025; originally announced March 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16492v1-abstract-full').style.display = 'none'; document.getElementById('2503.16492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15916">arXiv:2503.15916</a> <span> [<a href="https://arxiv.org/pdf/2503.15916">pdf</a>, <a href="https://arxiv.org/format/2503.15916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> ALLMod: Exploring $\underline{\mathbf{A}}$rea-Efficiency of $\underline{\mathbf{L}}$UT-based $\underline{\mathbf{L}}$arge Number $\underline{\mathbf{Mod}}$ular Reduction via Hybrid Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fangxin Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haomin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zongwu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingzhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shoumeng Yan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Li Jiang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+H">Haibing Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15916v1-abstract-short" style="display: inline;"> Modular arithmetic, particularly modular reduction, is widely used in cryptographic applications such as homomorphic encryption (HE) and zero-knowledge proofs (ZKP). High-bit-width operations are crucial for enhancing security; however, they are computationally intensive due to the large number of modular operations required. The lookup-table-based (LUT-based) approach, a ``space-for-time'' techni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15916v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15916v1-abstract-full" style="display: none;"> Modular arithmetic, particularly modular reduction, is widely used in cryptographic applications such as homomorphic encryption (HE) and zero-knowledge proofs (ZKP). High-bit-width operations are crucial for enhancing security; however, they are computationally intensive due to the large number of modular operations required. The lookup-table-based (LUT-based) approach, a ``space-for-time'' technique, reduces computational load by segmenting the input number into smaller bit groups, pre-computing modular reduction results for each segment, and storing these results in LUTs. While effective, this method incurs significant hardware overhead due to extensive LUT usage. 
In this paper, we introduce ALLMod, a novel approach that improves the area efficiency of LUT-based large-number modular reduction by employing hybrid workloads. Inspired by the iterative method, ALLMod splits the bit groups into two distinct workloads, achieving lower area costs without compromising throughput. We first develop a template to facilitate workload splitting and ensure balanced distribution. Then, we conduct design space exploration to evaluate the optimal timing for fusing workload results, enabling us to identify the most efficient design under specific constraints. Extensive evaluations show that ALLMod achieves up to $1.65\times$ and $3\times$ improvements in area efficiency over conventional LUT-based methods for bit-widths of $128$ and $8,192$, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15916v1-abstract-full').style.display = 'none'; document.getElementById('2503.15916v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 62nd Design Automation Conference ($\bf{DAC\ 2025}$)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15880">arXiv:2503.15880</a> <span> [<a href="https://arxiv.org/pdf/2503.15880">pdf</a>, <a href="https://arxiv.org/format/2503.15880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> InCo-DPO: Balancing Distribution Shift and Data Quality for Enhanced Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jijie Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo-Wen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liangdong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15880v1-abstract-short" style="display: inline;"> Direct Preference Optimization (DPO) optimizes language models to align with human preferences. Utilizing on-policy samples, generated directly by the policy model, typically results in better performance due to its distribution consistency with the model compared to off-policy samples. This paper identifies the quality of candidate preference samples as another critical factor. 
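
The baseline LUT technique described in the ALLMod abstract (split the input into bit groups, pre-compute each group's residue, then sum and reduce) can be illustrated in a few lines of software. The sketch below shows only that generic baseline, not ALLMod's hybrid-workload hardware design; the modulus, group width, and function names are assumptions chosen for the example.

```python
# Minimal software sketch of LUT-based modular reduction (generic baseline,
# not ALLMod's hybrid-workload design). Modulus, group width, and names are
# illustrative assumptions.

def build_luts(modulus: int, total_bits: int, group_bits: int = 8):
    """Pre-compute (segment_value << shift) mod modulus for every segment position."""
    n_groups = (total_bits + group_bits - 1) // group_bits
    luts = []
    for g in range(n_groups):
        shift = g * group_bits
        luts.append([(v << shift) % modulus for v in range(1 << group_bits)])
    return luts

def lut_mod_reduce(x: int, modulus: int, luts, group_bits: int = 8) -> int:
    """Reduce x mod modulus by summing pre-computed per-segment residues."""
    acc = 0
    for g, table in enumerate(luts):
        segment = (x >> (g * group_bits)) & ((1 << group_bits) - 1)
        acc += table[segment]
    return acc % modulus  # one final small reduction of the accumulated sum

if __name__ == "__main__":
    m = 0xFFFF_FFFB                      # example 32-bit modulus (assumption)
    luts = build_luts(m, total_bits=128)
    x = 2**127 + 12345
    assert lut_mod_reduce(x, m, luts) == x % m
```

Correctness rests on (a + b) mod m = ((a mod m) + (b mod m)) mod m applied across the per-group contributions; ALLMod's contribution is reducing the LUT area this baseline needs, which the sketch does not model.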

10. arXiv:2503.15880 [pdf, other] cs.LG (Machine Learning); cs.CL (Computation and Language)
InCo-DPO: Balancing Distribution Shift and Data Quality for Enhanced Preference Optimization
Authors: Yunan Wang, Jijie Li, Bo-Wen Zhang, Liangdong Wang, Guang Liu
Abstract: Direct Preference Optimization (DPO) optimizes language models to align with human preferences. Utilizing on-policy samples, generated directly by the policy model, typically results in better performance due to its distribution consistency with the model compared to off-policy samples. This paper identifies the quality of candidate preference samples as another critical factor. While the quality of on-policy data is inherently constrained by the capabilities of the policy model, off-policy data, which can be derived from diverse sources, offers greater potential for quality despite experiencing distribution shifts. However, current research mostly relies on on-policy data and neglects the value of off-policy data in terms of data quality, due to the challenge posed by distribution shift. In this paper, we propose InCo-DPO, an efficient method for synthesizing preference data by integrating on-policy and off-policy data, allowing dynamic adjustments to balance distribution shifts and data quality, thus finding an optimal trade-off. Consequently, InCo-DPO overcomes the limitations of distribution shifts in off-policy data and the quality constraints of on-policy data. We evaluated InCo-DPO with the Alpaca-Eval 2.0 and Arena-Hard benchmarks. Experimental results demonstrate that our approach not only outperforms both on-policy and off-policy data but also achieves a state-of-the-art win rate of 60.8 on Arena-Hard with vanilla DPO using the Gemma-2 model.
Submitted 20 March, 2025; originally announced March 2025.
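
For context on the objective the InCo-DPO entry builds on, here is a minimal sketch of the standard (vanilla) DPO loss for a single preference pair. The log-probability inputs and the beta value are placeholder assumptions; nothing here is specific to InCo-DPO's data-synthesis method.

```python
import math

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Standard DPO loss, -log(sigmoid(margin)), for one preference pair.

    Inputs are sequence log-probabilities of the chosen/rejected responses
    under the policy and the frozen reference model (placeholder values below).
    """
    # Implicit reward margin: beta * difference of policy-vs-reference log-ratios.
    margin = beta * ((logp_chosen - ref_logp_chosen)
                     - (logp_rejected - ref_logp_rejected))
    # -log(sigmoid(margin)) = log(1 + exp(-margin)), computed in a stable way.
    if margin >= 0:
        return math.log1p(math.exp(-margin))
    return -margin + math.log1p(math.exp(margin))

if __name__ == "__main__":
    # Toy numbers: the policy prefers the chosen response slightly more than the reference.
    print(dpo_loss(-12.0, -15.0, -13.0, -14.5, beta=0.1))  # ~0.62
```

In practice the loss is averaged over a batch and the log-probabilities come from summing token log-probs of each response; InCo-DPO's contribution lies in how the chosen/rejected pairs are constructed, not in this objective.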
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15842">arXiv:2503.15842</a> <span> [<a href="https://arxiv.org/pdf/2503.15842">pdf</a>, <a href="https://arxiv.org/format/2503.15842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedAWA: Adaptive Optimization of Aggregation Weights in Federated Learning Using Client Vectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+C">Changlong Shi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">He Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bingjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Mingyuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dandan Guo</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15842v1-abstract-short" style="display: inline;"> Federated Learning (FL) has emerged as a promising framework for distributed machine learning, enabling collaborative model training without sharing local data, thereby preserving privacy and enhancing security. However, data heterogeneity resulting from differences across user behaviors, preferences, and device characteristics poses a significant challenge for federated learning. Most previous wo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15842v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15842v1-abstract-full" style="display: none;"> Federated Learning (FL) has emerged as a promising framework for distributed machine learning, enabling collaborative model training without sharing local data, thereby preserving privacy and enhancing security. However, data heterogeneity resulting from differences across user behaviors, preferences, and device characteristics poses a significant challenge for federated learning. Most previous works overlook the adjustment of aggregation weights, relying solely on dataset size for weight assignment, which often leads to unstable convergence and reduced model performance. Recently, several studies have sought to refine aggregation strategies by incorporating dataset characteristics and model alignment. However, adaptively adjusting aggregation weights while ensuring data security-without requiring additional proxy data-remains a significant challenge. In this work, we propose Federated learning with Adaptive Weight Aggregation (FedAWA), a novel method that adaptively adjusts aggregation weights based on client vectors during the learning process. The client vector captures the direction of model updates, reflecting local data variations, and is used to optimize the aggregation weight without requiring additional datasets or violating privacy. By assigning higher aggregation weights to local models whose updates align closely with the global optimization direction, FedAWA enhances the stability and generalization of the global model. 
arXiv:2503.14182 [pdf, other] cs.RO, cs.CV — https://arxiv.org/abs/2503.14182
Bridging Past and Future: End-to-End Autonomous Driving with Historical Prediction and Planning
Authors: Bozhou Zhang, Nan Song, Xin Jin, Li Zhang
Abstract: End-to-end autonomous driving unifies tasks in a differentiable framework, enabling planning-oriented optimization and attracting growing attention. Current methods aggregate historical information either through dense historical bird's-eye-view (BEV) features or by querying a sparse memory bank, following paradigms inherited from detection. However, we argue that these paradigms either omit historical information in motion planning or fail to align with its multi-step nature, which requires predicting or planning multiple future time steps. In line with the philosophy that the future is a continuation of the past, we propose BridgeAD, which reformulates motion and planning queries as multi-step queries to differentiate the queries for each future time step. This design enables the effective use of historical prediction and planning by applying them to the appropriate parts of the end-to-end system based on the time steps, which improves both perception and motion planning. Specifically, historical queries for the current frame are combined with perception, while queries for future frames are integrated with motion planning. In this way, we bridge the gap between past and future by aggregating historical insights at every time step, enhancing the overall coherence and accuracy of the end-to-end autonomous driving pipeline. Extensive experiments on the nuScenes dataset in both open-loop and closed-loop settings demonstrate that BridgeAD achieves state-of-the-art performance.
Submitted 18 March, 2025; originally announced March 2025.
Comments: CVPR 2025
arXiv:2503.13537 [pdf, ps, other] cs.LG, cs.DC — https://arxiv.org/abs/2503.13537
FedTilt: Towards Multi-Level Fairness-Preserving and Robust Federated Learning
Authors: Binghui Zhang, Luis Mares De La Cruz, Binghui Wang
Abstract: Federated Learning (FL) is an emerging decentralized learning paradigm that can partly address the privacy concerns that cannot be handled by traditional centralized and distributed learning. Further, to make FL practical, it is also necessary to consider constraints such as fairness and robustness.
However, existing robust FL methods often produce unfair models, and existing fair FL methods only consider one-level (client) fairness and are not robust to persistent outliers (i.e., outliers injected into each training round) that are common in real-world FL settings. We propose FedTilt, a novel FL framework that can preserve multi-level fairness and be robust to outliers. In particular, we consider two common levels of fairness: client fairness, i.e., uniformity of performance across clients, and client data fairness, i.e., uniformity of performance across different classes of data within a client. FedTilt is inspired by the recently proposed tilted empirical risk minimization, which introduces tilt hyperparameters that can be flexibly tuned. Theoretically, we show how tuning tilt values can achieve the two-level fairness and mitigate the persistent outliers, and we derive the convergence condition of FedTilt as well. Empirically, our evaluation results on a suite of realistic federated datasets in diverse settings show the effectiveness and flexibility of the FedTilt framework and its superiority over the state of the art.
Submitted 15 March, 2025; originally announced March 2025.
Comments: 13 pages
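FedTilt builds on tilted empirical risk minimization, whose objective is simple to state concretely. A minimal sketch, assuming a batch of per-sample losses and placeholder tilt values; the two-level composition mirrors the client/client-data fairness levels mentioned above but is not the paper's exact algorithm.

```python
import torch

def tilted_loss(losses: torch.Tensor, t: float) -> torch.Tensor:
    """Tilted empirical risk: (1/t) * log(mean(exp(t * losses))).
    t > 0 emphasizes large (worst-case) losses, t -> 0 recovers the mean,
    and t < 0 de-emphasizes outliers."""
    if abs(t) < 1e-8:
        return losses.mean()
    log_n = torch.log(torch.tensor(float(losses.numel())))
    return torch.logsumexp(t * losses - log_n, dim=0) / t

def two_level_tilted_risk(per_client_losses, t_sample: float, t_client: float) -> torch.Tensor:
    """Sample-level tilt within each client, then a client-level tilt across clients."""
    client_risks = torch.stack([tilted_loss(l, t_sample) for l in per_client_losses])
    return tilted_loss(client_risks, t_client)
```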
arXiv:2503.11730 [pdf, other] cs.LG, cs.AI — https://arxiv.org/abs/2503.11730
BACE-RUL: A Bi-directional Adversarial Network with Covariate Encoding for Machine Remaining Useful Life Prediction
Authors: Zekai Zhang, Dan Li, Shunyu Wu, Junya Cai, Bo Zhang, See Kiong Ng, Zibin Zheng
Abstract: Prognostic and Health Management (PHM) is a crucial way to avoid unnecessary maintenance for Cyber-Physical Systems (CPS) and improve system reliability. Predicting the Remaining Useful Life (RUL) is one of the most challenging tasks in PHM. Existing methods require prior knowledge about the system, contrived assumptions, or temporal mining to model the life cycles of machine equipment/devices, resulting in diminished accuracy and limited applicability in real-world scenarios. This paper proposes a Bi-directional Adversarial network with Covariate Encoding for machine Remaining Useful Life (BACE-RUL) prediction, which only adopts sensor measurements from the current life cycle to predict RUL rather than relying on previous consecutive cycle recordings. The current sensor measurements of mechanical devices are encoded into a conditional space to better represent the implicit inner mechanical status. The predictor is trained as a conditional generative network with the encoded sensor measurements as its conditions. Various experiments on several real-world datasets, including the turbofan aircraft engine dataset and a dataset collected from degradation experiments on Li-Ion battery cells, show that the proposed model is a general framework and outperforms state-of-the-art methods.
Submitted 14 March, 2025; originally announced March 2025.
Comments: This paper has been accepted as a research paper at CollaborateCom 2024
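A minimal sketch of the conditional-generation setup described above, with placeholder layer sizes; it is not the authors' architecture and omits the bi-directional adversarial training loop.

```python
import torch
import torch.nn as nn

class CovariateEncoder(nn.Module):
    """Encode current-cycle sensor measurements into a condition vector (sketch)."""
    def __init__(self, n_sensors: int, cond_dim: int = 32):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_sensors, 64), nn.ReLU(), nn.Linear(64, cond_dim))

    def forward(self, x):
        return self.net(x)

class ConditionalRULGenerator(nn.Module):
    """Produce an RUL estimate from noise conditioned on the encoded measurements."""
    def __init__(self, cond_dim: int = 32, noise_dim: int = 8):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(cond_dim + noise_dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, cond, noise):
        return self.net(torch.cat([cond, noise], dim=-1))
```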
arXiv:2503.11368 [pdf, other] cs.CV — https://arxiv.org/abs/2503.11368
PBR3DGen: A VLM-guided Mesh Generation with High-quality PBR Texture
Authors: Xiaokang Wei, Bowen Zhang, Xianghui Yang, Yuxuan Wang, Chunchao Guo, Xi Zhao, Yan Luximon
Abstract: Generating high-quality physically based rendering (PBR) materials is important for achieving realistic rendering in downstream tasks, yet it remains challenging due to the intertwined effects of materials and lighting. While existing methods have made breakthroughs by incorporating material decomposition into the 3D generation pipeline, they tend to bake highlights into albedo and ignore the spatially varying properties of metallicity and roughness. In this work, we present PBR3DGen, a two-stage mesh generation method with high-quality PBR materials that integrates a novel multi-view PBR material estimation model and a 3D PBR mesh reconstruction model. Specifically, PBR3DGen leverages vision-language models (VLM) to guide multi-view diffusion, precisely capturing the spatial distribution and inherent attributes of reflective-metalness material. Additionally, we incorporate view-dependent illumination-aware conditions as pixel-aware priors to enhance spatially varying material properties. Furthermore, our reconstruction model reconstructs high-quality meshes with PBR materials. Experimental results demonstrate that PBR3DGen significantly outperforms existing methods, achieving new state-of-the-art results for PBR estimation and mesh generation. More results and visualizations can be found on our project page: https://pbr3dgen1218.github.io/.
Submitted 14 March, 2025; originally announced March 2025.
Comments: Homepage: https://pbr3dgen1218.github.io/
arXiv:2503.11292 [pdf, other] cs.CE — https://arxiv.org/abs/2503.11292
Corrected Riemann smoothed particle hydrodynamics method for multi-resolution fluid-structure interaction
Authors: Bo Zhang, Jianfeng Zhu, Xiangyu Hu
Abstract: As a mesh-free method, smoothed particle hydrodynamics (SPH) has been widely used for modeling and simulating fluid-structure interaction (FSI) problems. While the kernel gradient correction (KGC) method is commonly applied in structural domains to enhance numerical consistency, high-order consistency corrections that preserve conservation remain underutilized in fluid domains despite their critical role in FSI analysis, especially for multi-resolution schemes where the fluid domain generally has a low resolution.
In this study, we incorporate the reverse kernel gradient correction (RKGC) formulation, a conservative high-order consistency approximation, into the fluid discretization for solving FSI problems. RKGC has been proven to achieve exact second-order convergence with relaxed particles and to improve numerical accuracy, particularly enhancing energy conservation in free-surface flow simulations. By integrating this correction into the Riemann SPH method to solve different typical FSI problems with a multi-resolution scheme, numerical results consistently show improvements in accuracy and convergence compared to the uncorrected fluid discretization. Despite these advances, further refinement of correction techniques for solid domains and fluid-structure interfaces remains important for enhancing the overall accuracy of SPH-based FSI modeling and simulation.
Submitted 14 March, 2025; originally announced March 2025.
Comments: 47 pages, 19 figures
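For context, the standard kernel gradient correction referred to above computes, per particle, a matrix that restores first-order consistency of SPH gradient approximations. A sketch under that assumption follows; the paper's reverse, conservation-preserving RKGC variant is not reproduced here.

```python
import numpy as np

def kgc_matrix(r_i, neighbors):
    """Standard kernel gradient correction matrix B_i for particle i.
    neighbors: iterable of (r_j, V_j, grad_W_ij) tuples (position, volume,
    kernel gradient). The corrected kernel gradient is B_i @ grad_W_ij,
    which makes the SPH gradient exact for linear fields."""
    dim = len(r_i)
    A = np.zeros((dim, dim))
    for r_j, V_j, grad_W_ij in neighbors:
        A += V_j * np.outer(grad_W_ij, np.asarray(r_j) - np.asarray(r_i))
    return np.linalg.inv(A)
```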
arXiv:2503.11043 [pdf, other] cs.LG — https://arxiv.org/abs/2503.11043
InverseBench: Benchmarking Plug-and-Play Diffusion Priors for Inverse Problems in Physical Sciences
Authors: Hongkai Zheng, Wenda Chu, Bingliang Zhang, Zihui Wu, Austin Wang, Berthy T. Feng, Caifeng Zou, Yu Sun, Nikola Kovachki, Zachary E. Ross, Katherine L. Bouman, Yisong Yue
Abstract: Plug-and-play diffusion priors (PnPDP) have emerged as a promising research direction for solving inverse problems. However, current studies primarily focus on natural image restoration, leaving the performance of these algorithms in scientific inverse problems largely unexplored. To address this gap, we introduce InverseBench, a framework that evaluates diffusion models across five distinct scientific inverse problems. These problems present unique structural challenges that differ from existing benchmarks, arising from critical scientific applications such as optical tomography, medical imaging, black hole imaging, seismology, and fluid dynamics. With InverseBench, we benchmark 14 inverse problem algorithms that use plug-and-play diffusion priors against strong, domain-specific baselines, offering valuable new insights into the strengths and weaknesses of existing algorithms. To facilitate further research and development, we open-source the codebase, along with datasets and pre-trained models, at https://devzhk.github.io/InverseBench/.
Submitted 13 March, 2025; originally announced March 2025.
arXiv:2503.10677 [pdf, other] cs.CL, cs.AI — https://arxiv.org/abs/2503.10677
A Survey on Knowledge-Oriented Retrieval-Augmented Generation
Authors: Mingyue Cheng, Yucong Luo, Jie Ouyang, Qi Liu, Huijie Liu, Li Li, Shuo Yu, Bohou Zhang, Jiawei Cao, Jie Ma, Daoyu Wang, Enhong Chen
Abstract: Retrieval-Augmented Generation (RAG) has gained significant attention in recent years for its potential to enhance natural language understanding and generation by combining large-scale retrieval systems with generative models. RAG leverages external knowledge sources, such as documents, databases, or structured data, to improve model performance and generate more accurate and contextually relevant outputs.
This survey aims to provide a comprehensive overview of RAG by examining its fundamental components, including retrieval mechanisms, generation processes, and the integration between the two. We discuss the key characteristics of RAG, such as its ability to augment generative models with dynamic external knowledge, and the challenges associated with aligning retrieved information with generative objectives. We also present a taxonomy that categorizes RAG methods, ranging from basic retrieval-augmented approaches to more advanced models incorporating multi-modal data and reasoning capabilities. Additionally, we review the evaluation benchmarks and datasets commonly used to assess RAG systems, along with a detailed exploration of its applications in fields such as question answering, summarization, and information retrieval. Finally, we highlight emerging research directions and opportunities for improving RAG systems, such as enhanced retrieval efficiency, model interpretability, and domain-specific adaptations. This paper concludes by outlining the prospects for RAG in addressing real-world challenges and its potential to drive further advancements in natural language processing.
Submitted 17 March, 2025; v1 submitted 10 March, 2025; originally announced March 2025.
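A toy retrieval-augmented pipeline, kept deliberately minimal (bag-of-words retrieval, generation left as a prompt string), just to make the retrieve-then-generate structure discussed in the survey concrete; a real system would use a dense encoder and an LLM call.

```python
from collections import Counter
import math

def embed(text: str) -> Counter:
    """Toy bag-of-words embedding standing in for a dense encoder."""
    return Counter(text.lower().split())

def cosine(a: Counter, b: Counter) -> float:
    dot = sum(a[t] * b[t] for t in a)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb + 1e-12)

def retrieve(query: str, corpus: list[str], k: int = 2) -> list[str]:
    q = embed(query)
    return sorted(corpus, key=lambda d: cosine(q, embed(d)), reverse=True)[:k]

def build_prompt(query: str, corpus: list[str]) -> str:
    """Retrieval-augmented prompt; the generation step (an LLM call) is omitted."""
    context = "\n".join(retrieve(query, corpus))
    return f"Answer using the context below.\n\nContext:\n{context}\n\nQuestion: {query}"
```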
arXiv:2503.10618 [pdf, other] cs.CV — https://arxiv.org/abs/2503.10618
DiT-Air: Revisiting the Efficiency of Diffusion Model Architecture Design in Text to Image Generation
Authors: Chen Chen, Rui Qian, Wenze Hu, Tsu-Jui Fu, Jialing Tong, Xinze Wang, Lezhi Li, Bowen Zhang, Alex Schwing, Wei Liu, Yinfei Yang
Abstract: In this work, we empirically study Diffusion Transformers (DiTs) for text-to-image generation, focusing on architectural choices, text-conditioning strategies, and training protocols. We evaluate a range of DiT-based architectures, including PixArt-style and MMDiT variants, and compare them with a standard DiT variant that directly processes concatenated text and noise inputs. Surprisingly, our findings reveal that the performance of the standard DiT is comparable with those specialized models, while demonstrating superior parameter efficiency, especially when scaled up. Leveraging a layer-wise parameter sharing strategy, we achieve a further 66% reduction in model size compared to an MMDiT architecture, with minimal performance impact. Building on an in-depth analysis of critical components such as text encoders and Variational Auto-Encoders (VAEs), we introduce DiT-Air and DiT-Air-Lite. With supervised and reward fine-tuning, DiT-Air achieves state-of-the-art performance on GenEval and T2I CompBench, while DiT-Air-Lite remains highly competitive, surpassing most existing models despite its compact size.
Submitted 14 March, 2025; v1 submitted 13 March, 2025; originally announced March 2025.
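The layer-wise parameter sharing mentioned above is easy to illustrate: the same block is applied repeatedly over depth, so the parameter count stops growing with the number of layers. The sketch below uses a generic Transformer encoder layer with placeholder sizes, not the DiT-Air architecture.

```python
import torch.nn as nn

class SharedBlockStack(nn.Module):
    """Layer-wise parameter sharing (sketch): one block's weights are reused
    at every depth, trading per-layer capacity for a much smaller model."""
    def __init__(self, dim: int = 256, heads: int = 4, depth: int = 12):
        super().__init__()
        self.block = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, batch_first=True)
        self.depth = depth

    def forward(self, x):
        for _ in range(self.depth):   # same parameters applied `depth` times
            x = self.block(x)
        return x
```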
arXiv:2503.10615 [pdf, other] cs.CV — https://arxiv.org/abs/2503.10615
R1-Onevision: Advancing Generalized Multimodal Reasoning through Cross-Modal Formalization
Authors: Yi Yang, Xiaoxuan He, Hongkun Pan, Xiyan Jiang, Yan Deng, Xingtao Yang, Haoyu Lu, Dacheng Yin, Fengyun Rao, Minfeng Zhu, Bo Zhang, Wei Chen
Abstract: Large Language Models have demonstrated remarkable reasoning capability in complex textual tasks. However, multimodal reasoning, which requires integrating visual and textual information, remains a significant challenge. Existing visual-language models often struggle to effectively analyze and reason about visual content, resulting in suboptimal performance on complex reasoning tasks. Moreover, the absence of comprehensive benchmarks hinders the accurate assessment of multimodal reasoning capabilities. In this paper, we introduce R1-Onevision, a multimodal reasoning model designed to bridge the gap between visual perception and deep reasoning. To achieve this, we propose a cross-modal reasoning pipeline that transforms images into formal textual representations, enabling precise language-based reasoning. Leveraging this pipeline, we construct the R1-Onevision dataset, which provides detailed, step-by-step multimodal reasoning annotations across diverse domains. We further develop the R1-Onevision model through supervised fine-tuning and reinforcement learning to cultivate advanced reasoning and robust generalization abilities. To comprehensively evaluate multimodal reasoning performance across different grades, we introduce R1-Onevision-Bench, a benchmark aligned with human educational stages, covering exams from junior high school to university and beyond. Experimental results show that R1-Onevision achieves state-of-the-art performance, outperforming models such as GPT-4o and Qwen2.5-VL on multiple challenging multimodal reasoning benchmarks.
Submitted 18 March, 2025; v1 submitted 13 March, 2025; originally announced March 2025.
Comments: Code and Model: https://github.com/Fancy-MLLM/R1-onevision
arXiv:2503.10489 [pdf, other] q-bio.BM, cs.LG — https://arxiv.org/abs/2503.10489
Beyond Atoms: Enhancing Molecular Pretrained Representations with 3D Space Modeling
Authors: Shuqi Lu, Xiaohong Ji, Bohang Zhang, Lin Yao, Siyuan Liu, Zhifeng Gao, Linfeng Zhang, Guolin Ke
Abstract: Molecular pretrained representations (MPR) have emerged as a powerful approach for addressing the challenge of limited supervised data in applications such as drug discovery and material design. While early MPR methods relied on 1D sequences and 2D graphs, recent advancements have incorporated 3D conformational information to capture rich atomic interactions.
However, these prior models treat molecules merely as discrete atom sets, overlooking the space surrounding them. We argue from a physical perspective that modeling only these discrete points is insufficient. We first present a simple yet insightful observation: naively adding randomly sampled virtual points beyond atoms can surprisingly enhance MPR performance. In light of this, we propose a principled framework that incorporates the entire 3D space spanned by molecules. We implement the framework via a novel Transformer-based architecture, dubbed SpaceFormer, with three key components: (1) grid-based space discretization; (2) grid sampling/merging; and (3) efficient 3D positional encoding. Extensive experiments show that SpaceFormer significantly outperforms previous 3D MPR models across various downstream tasks with limited data, validating the benefit of leveraging the additional 3D space beyond atoms in MPR models.
Submitted 18 March, 2025; v1 submitted 13 March, 2025; originally announced March 2025.
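The "virtual points beyond atoms" observation can be sketched in a few lines. The margin, point count, and uniform sampling scheme below are assumptions for illustration, not SpaceFormer's grid-based discretization, and feature handling for the added points is omitted.

```python
import numpy as np

def add_virtual_points(atom_coords: np.ndarray, n_virtual: int = 16, margin: float = 2.0):
    """Sample random 3D points in a padded bounding box around the molecule and
    append them to the atom coordinate set (shape: [n_atoms, 3] -> [n_atoms + n_virtual, 3])."""
    lo = atom_coords.min(axis=0) - margin
    hi = atom_coords.max(axis=0) + margin
    virtual = np.random.uniform(lo, hi, size=(n_virtual, 3))
    return np.vstack([atom_coords, virtual])
```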
arXiv:2503.10009 [pdf, other] cs.AI, math.OC — https://arxiv.org/abs/2503.10009
OR-LLM-Agent: Automating Modeling and Solving of Operations Research Optimization Problem with Reasoning Large Language Model
Authors: Bowen Zhang, Pengcheng Luo
Abstract: Operations Research (OR) has been widely applied in various fields such as resource allocation, production planning, and supply chain management. However, addressing real-world OR problems requires OR experts to perform mathematical modeling and programmers to develop solution algorithms. This traditional method, heavily reliant on experts, is costly and has long development cycles, severely limiting the widespread adoption of OR techniques. Few have considered using Artificial Intelligence (AI) to replace professionals and achieve fully automated solutions for OR problems. We propose OR-LLM-Agent, the first AI agent that enables end-to-end automation for solving real-world OR problems. OR-LLM-Agent leverages the Chain-of-Thought (CoT) reasoning capabilities of Large Language Models (LLMs) to translate natural language problem descriptions into formal mathematical models and automatically generate Gurobi solver code.
In OR-LLM-Agent, OR-CodeAgent is designed to automate code execution and repair within a sandbox environment, facilitating the derivation of the final solution. Due to the lack of dedicated benchmark datasets for evaluating the automated solving of OR problems, we construct a benchmark dataset comprising 83 real-world OR problems described in natural language. We conduct comparative experiments with state-of-the-art (SOTA) reasoning LLMs, including GPT-o3-mini, DeepSeek-R1, and Gemini 2.0 Flash Thinking. OR-LLM-Agent achieved the highest pass rate of 100% and the highest solution accuracy of 85%, demonstrating the feasibility of automated OR problem-solving. Data and code are publicly available at https://github.com/bwz96sco/or_llm_agent.
Submitted 12 March, 2025; originally announced March 2025.
Comments: 11 pages, 6 figures
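The generate-execute-repair pattern described above can be sketched as follows. Here `llm_generate` is a placeholder for the model call, and a plain subprocess stands in for the sandboxed execution environment; this is a generic agent-loop sketch, not the paper's OR-CodeAgent.

```python
import os
import subprocess
import tempfile

def llm_generate(prompt: str) -> str:
    """Placeholder for an LLM call that turns an OR problem description into solver code."""
    raise NotImplementedError

def solve_with_repair(problem_text: str, max_attempts: int = 3) -> str:
    """Generate code, run it, and feed any error back to the model for repair."""
    prompt = f"Write Python code that models and solves this OR problem:\n{problem_text}"
    for _ in range(max_attempts):
        code = llm_generate(prompt)
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
            f.write(code)
            path = f.name
        result = subprocess.run(["python", path], capture_output=True, text=True, timeout=120)
        os.unlink(path)
        if result.returncode == 0:
            return result.stdout
        prompt += f"\n\nThe previous code failed with:\n{result.stderr}\nPlease fix it."
    raise RuntimeError("No working solution after repair attempts")
```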
arXiv:2503.09631 [pdf, other] cs.GR, eess.IV — https://arxiv.org/abs/2503.09631
V2M4: 4D Mesh Animation Reconstruction from a Single Monocular Video
Authors: Jianqi Chen, Biao Zhang, Xiangjun Tang, Peter Wonka
Abstract: We present V2M4, a novel 4D reconstruction method that directly generates a usable 4D mesh animation asset from a single monocular video. Unlike existing approaches that rely on priors from multi-view image and video generation models, our method is based on native 3D mesh generation models. Naively applying 3D mesh generation models to generate a mesh for each frame in a 4D task can lead to issues such as incorrect mesh poses, misalignment of mesh appearance, and inconsistencies in mesh geometry and texture maps. To address these problems, we propose a structured workflow that includes camera search and mesh reposing, condition embedding optimization for mesh appearance refinement, pairwise mesh registration for topology consistency, and global texture map optimization for texture consistency. Our method outputs high-quality 4D animated assets that are compatible with mainstream graphics and game software. Experimental results across a variety of animation types and motion amplitudes demonstrate the generalization and effectiveness of our method. Project page: https://windvchen.github.io/V2M4/.
Submitted 11 March, 2025; originally announced March 2025.
Comments: Project page: https://windvchen.github.io/V2M4/
arXiv:2503.08422 [pdf, other] cs.CV — https://arxiv.org/abs/2503.08422
JiSAM: Alleviate Labeling Burden and Corner Case Problems in Autonomous Driving via Minimal Real-World Data
Authors: Runjian Chen, Wenqi Shao, Bo Zhang, Shaoshuai Shi, Li Jiang, Ping Luo
Abstract: Deep-learning-based autonomous driving (AD) perception promises safe and environment-friendly transportation. However, the over-reliance on real labeled data in LiDAR perception limits the scale of on-road attempts. 3D real-world data is notoriously time- and energy-consuming to annotate and lacks corner cases such as rare traffic participants. On the contrary, in simulators like CARLA, generating labeled LiDAR point clouds with corner cases is straightforward. However, introducing synthetic point clouds to improve real perception is non-trivial. This stems from two challenges: 1) sample efficiency of simulation datasets and 2) simulation-to-real gaps. To overcome both challenges, we propose a plug-and-play method called JiSAM, shorthand for Jittering augmentation, domain-aware backbone and memory-based Sectorized AlignMent. In extensive experiments conducted on the well-known AD dataset NuScenes, we demonstrate that, with a SOTA 3D object detector, JiSAM is able to utilize the simulation data and labels on only 2.5% of the available real data to achieve performance comparable to models trained on all real data. Additionally, JiSAM achieves more than 15 mAP on the objects not labeled in the real training set. We will release models and code.
Submitted 13 March, 2025; v1 submitted 11 March, 2025; originally announced March 2025.
arXiv:2503.08292 (https://arxiv.org/abs/2503.08292) [pdf, other] cs.CL, cs.AI
Large Language Models for Outpatient Referral: Problem Definition, Benchmarking and Challenges
Authors: Xiaoxiao Liu, Qingying Xiao, Junying Chen, Xiangyi Feng, Xiangbo Wu, Bairui Zhang, Xiang Wan, Jian Chang, Guangjun Yu, Yan Hu, Benyou Wang
Abstract: Large language models (LLMs) are increasingly applied to outpatient referral tasks across healthcare systems. However, there is a lack of standardized evaluation criteria to assess their effectiveness, particularly in dynamic, interactive scenarios. In this study, we systematically examine the capabilities and limitations of LLMs in managing tasks within Intelligent Outpatient Referral (IOR) systems and propose a comprehensive evaluation framework specifically designed for such systems. This framework comprises two core tasks: static evaluation, which assesses the ability to make predefined outpatient referrals, and dynamic evaluation, which assesses the ability to refine outpatient referral recommendations through iterative dialogues. Our findings suggest that LLMs offer limited advantages over BERT-like models, but show promise in asking effective questions during interactive dialogues.
Submitted 11 March, 2025; originally announced March 2025.

arXiv:2503.07968 (https://arxiv.org/abs/2503.07968) [pdf, other] cs.CL
LabelCoRank: Revolutionizing Long Tail Multi-Label Classification with Co-Occurrence Reranking
Authors: Yan Yan, Junyuan Liu, Bo-Wen Zhang
Abstract: Motivation: Despite recent advancements in semantic representation driven by pre-trained and large-scale language models, addressing long-tail challenges in multi-label text classification remains a significant issue. Long-tail challenges have persistently posed difficulties in accurately classifying less frequent labels. Current approaches often focus on improving text semantics while neglecting the crucial role of label relationships. Results: This paper introduces LabelCoRank, a novel approach inspired by ranking principles. LabelCoRank leverages label co-occurrence relationships to refine initial label classifications through a dual-stage reranking process. The first stage uses initial classification results to form a preliminary ranking. In the second stage, a label co-occurrence matrix is used to rerank the preliminary results, enhancing the accuracy and relevance of the final classifications. By integrating the reranked label representations as additional text features, LabelCoRank effectively mitigates long-tail issues in multi-label text classification. Experimental evaluations on popular datasets including MAG-CS, PubMed, and AAPD demonstrate the effectiveness and robustness of LabelCoRank.
Submitted 10 March, 2025; originally announced March 2025.
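Note on LabelCoRank above: the dual-stage reranking can be pictured as estimating a label co-occurrence matrix from the training labels and then blending the classifier's initial scores with co-occurrence-propagated scores. The sketch below illustrates only that general idea; the mixing weight alpha and the row normalization are assumptions, not details from the paper.

    import numpy as np

    def cooccurrence_matrix(train_labels):
        # train_labels: (num_docs, num_labels) binary matrix -> row-normalized co-occurrence matrix.
        counts = (train_labels.T @ train_labels).astype(float)
        np.fill_diagonal(counts, 0.0)                  # ignore self co-occurrence
        row_sums = counts.sum(axis=1, keepdims=True)
        return np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums > 0)

    def rerank(initial_scores, cooc, alpha=0.7):
        # Stage 2: blend the preliminary ranking with co-occurrence-propagated scores.
        propagated = initial_scores @ cooc             # tail labels borrow evidence from co-occurring labels
        return alpha * initial_scores + (1 - alpha) * propagated

    # Toy setup: 3 labels; the tail label (index 2) frequently co-occurs with label 0.
    train = np.array([[1, 0, 1], [1, 1, 0], [1, 0, 1], [0, 1, 0]])
    scores = np.array([[0.9, 0.2, 0.1]])               # classifier is unsure about the tail label
    print(rerank(scores, cooccurrence_matrix(train)))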
arXiv:2503.07167 (https://arxiv.org/abs/2503.07167) [pdf, other] cs.CV, cs.RO
Temporal Overlapping Prediction: A Self-supervised Pre-training Method for LiDAR Moving Object Segmentation
Authors: Ziliang Miao, Runjian Chen, Yixi Cai, Buwei He, Wenquan Zhao, Wenqi Shao, Bo Zhang, Fu Zhang
Abstract: Moving object segmentation (MOS) on LiDAR point clouds is crucial for autonomous systems like self-driving vehicles. Previous supervised approaches rely heavily on costly manual annotations, while LiDAR sequences naturally capture temporal motion cues that can be leveraged for self-supervised learning. In this paper, we propose Temporal Overlapping Prediction (TOP), a self-supervised pre-training method that alleviates the labeling burden for MOS. TOP exploits the temporal overlapping points that are commonly observed by current and adjacent scans, and learns spatiotemporal representations by predicting the occupancy states of these overlapping points. Moreover, we use current occupancy reconstruction as an auxiliary pre-training objective, which enhances the model's awareness of the current scan's structure. We conduct extensive experiments and observe that the conventional metric Intersection-over-Union (IoU) shows a strong bias toward objects with more scanned points, which can neglect small or distant objects. To compensate for this bias, we introduce an additional metric, $\text{mIoU}_{\text{obj}}$, to evaluate object-level performance. Experiments on nuScenes and SemanticKITTI show that TOP outperforms both the supervised training-from-scratch baseline and other self-supervised pre-training baselines by up to 28.77% relative improvement, demonstrating strong transferability across LiDAR setups and generalization to other tasks. Code and pre-trained models will be publicly available upon publication.
Submitted 10 March, 2025; originally announced March 2025.
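Note on TOP above: the abstract introduces $\text{mIoU}_{\text{obj}}$ without defining it; a natural reading is to compute IoU per annotated object instance and then average over instances, so that small or distant objects weigh as much as large ones. The sketch below follows that assumed definition.

    import numpy as np

    def miou_obj(pred_moving, gt_moving, instance_ids):
        # pred_moving, gt_moving: (N,) boolean per-point "moving" predictions / labels.
        # instance_ids: (N,) object instance id per point (0 = background / no instance).
        ious = []
        for obj in np.unique(instance_ids):
            if obj == 0:
                continue
            m = instance_ids == obj
            inter = np.logical_and(pred_moving[m], gt_moving[m]).sum()
            union = np.logical_or(pred_moving[m], gt_moving[m]).sum()
            if union > 0:
                ious.append(inter / union)
        return float(np.mean(ious)) if ious else 0.0

    # Toy scan: object 1 has eight points, object 2 only two; missing the small object halves the score.
    gt = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=bool)
    pred = np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0], dtype=bool)
    ids = np.array([1, 1, 1, 1, 1, 1, 1, 1, 2, 2])
    print(miou_obj(pred, gt, ids))  # 0.5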
arXiv:2503.06486 (https://arxiv.org/abs/2503.06486) [pdf, other] cs.CV, cs.AI
PerturboLLaVA: Reducing Multimodal Hallucinations with Perturbative Visual Training
Authors: Cong Chen, Mingyu Liu, Chenchen Jing, Yizhou Zhou, Fengyun Rao, Hao Chen, Bo Zhang, Chunhua Shen
Abstract: This paper aims to address the challenge of hallucinations in Multimodal Large Language Models (MLLMs), particularly for dense image captioning tasks. To tackle the challenge, we identify the current lack of a metric that finely measures caption quality at the concept level. We hereby introduce HalFscore, a novel metric built upon the language graph and designed to evaluate both the accuracy and completeness of dense captions at a granular level. Additionally, we identify the root cause of hallucination as the model's over-reliance on its language prior. To address this, we propose PerturboLLaVA, which reduces the model's reliance on the language prior by incorporating adversarially perturbed text during training. This method enhances the model's focus on visual inputs, effectively reducing hallucinations and producing accurate, image-grounded descriptions without incurring additional computational overhead. PerturboLLaVA significantly improves the fidelity of generated captions, outperforming existing approaches in handling multimodal hallucinations and achieving improved performance across general multimodal benchmarks.
Submitted 9 March, 2025; originally announced March 2025.

arXiv:2503.05088 (https://arxiv.org/abs/2503.05088) [pdf, other] cs.RO
An End-to-End Learning-Based Multi-Sensor Fusion for Autonomous Vehicle Localization
Authors: Changhong Lin, Jiarong Lin, Zhiqiang Sui, XiaoZhi Qu, Rui Wang, Kehua Sheng, Bo Zhang
Abstract: Multi-sensor fusion is essential for autonomous vehicle localization, as it can integrate data from various sources for enhanced accuracy and reliability. The accuracy of the integrated location and orientation depends on the precision of the uncertainty modeling. Traditional methods of uncertainty modeling typically assume a Gaussian distribution and involve manual heuristic parameter tuning. However, these methods struggle to scale effectively and address long-tail scenarios. To address these challenges, we propose a learning-based method that encodes sensor information using higher-order neural network features, thereby eliminating the need for explicit uncertainty estimation. This method largely removes the need for parameter fine-tuning by developing an end-to-end neural network that is specifically designed for multi-sensor fusion. In our experiments, we demonstrate the effectiveness of our approach in real-world autonomous driving scenarios. Results show that the proposed method outperforms existing multi-sensor fusion methods in terms of both accuracy and robustness. A video of the results can be viewed at https://youtu.be/q4iuobMbjME.
Submitted 6 March, 2025; originally announced March 2025.
Comments: 7 pages, 8 figures, to be published in ICRA2025
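Note on the localization paper above: the abstract replaces hand-tuned Gaussian uncertainty modeling with an end-to-end fusion network but does not describe the architecture. The toy sketch below (a small MLP over stacked per-sensor pose estimates, trained against a ground-truth pose) only illustrates the general end-to-end pattern and is not the paper's model.

    import torch
    import torch.nn as nn

    class FusionHead(nn.Module):
        # Illustrative learned fusion: maps stacked per-sensor pose estimates to one fused pose.
        def __init__(self, num_sensors=3, pose_dim=6, hidden=64):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(num_sensors * pose_dim, hidden),
                nn.ReLU(),
                nn.Linear(hidden, pose_dim),   # fused (x, y, z, roll, pitch, yaw)
            )

        def forward(self, per_sensor_poses):
            # per_sensor_poses: (batch, num_sensors, pose_dim) -> (batch, pose_dim)
            return self.net(per_sensor_poses.flatten(start_dim=1))

    # One supervised step against a ground-truth pose, standing in for end-to-end training.
    model = FusionHead()
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    measurements = torch.randn(8, 3, 6)        # e.g., GNSS, LiDAR odometry, wheel/IMU odometry
    gt_pose = torch.randn(8, 6)
    loss = nn.functional.mse_loss(model(measurements), gt_pose)
    loss.backward()
    opt.step()
    print(float(loss))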
arXiv:2503.04837 (https://arxiv.org/abs/2503.04837) [pdf, other] cs.CV, cs.AI, cs.CR
FedPalm: A General Federated Learning Framework for Closed- and Open-Set Palmprint Verification
Authors: Ziyuan Yang, Yingyu Chen, Chengrui Gao, Andrew Beng Jin Teoh, Bob Zhang, Yi Zhang
Abstract: Current deep learning (DL)-based palmprint verification models rely on centralized training with large datasets, which raises significant privacy concerns due to the sensitive and immutable nature of biometric data. Federated learning (FL), a privacy-preserving distributed learning paradigm, offers a compelling alternative by enabling collaborative model training without the need for data sharing. However, FL-based palmprint verification faces critical challenges, including data heterogeneity from diverse identities and the absence of standardized evaluation benchmarks. This paper addresses these gaps by establishing a comprehensive benchmark for FL-based palmprint verification, which explicitly defines and evaluates two practical scenarios: closed-set and open-set verification. We propose FedPalm, a unified FL framework that balances local adaptability with global generalization. Each client trains a personalized textural expert tailored to local data and collaboratively contributes to a shared global textural expert for extracting generalized features. To further enhance verification performance, we introduce a Textural Expert Interaction Module that dynamically routes textural features among experts to generate refined side textural features. Learnable parameters are employed to model relationships between original and side features, fostering cross-texture-expert interaction and improving feature discrimination. Extensive experiments validate the effectiveness of FedPalm, demonstrating robust performance across both scenarios and providing a promising foundation for advancing FL-based palmprint verification research.
Submitted 5 March, 2025; originally announced March 2025.
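Note on FedPalm above: the split between a personalized local expert and a shared global expert follows a common personalized-FL pattern in which only the shared expert is aggregated across clients. The sketch below shows one FedAvg-style round under that assumption; names such as global_expert are illustrative, not the paper's API.

    import numpy as np

    def fedavg(client_params, client_sizes):
        # Size-weighted average of the clients' shared (global-expert) parameters for one FL round.
        total = float(sum(client_sizes))
        return {
            name: sum(p[name] * (n / total) for p, n in zip(client_params, client_sizes))
            for name in client_params[0]
        }

    # Each client keeps its personalized expert locally and only shares the global expert.
    clients = [
        {"global_expert": {"w": np.array([1.0, 0.0])}, "n_samples": 120},
        {"global_expert": {"w": np.array([0.0, 1.0])}, "n_samples": 40},
    ]
    new_global = fedavg([c["global_expert"] for c in clients], [c["n_samples"] for c in clients])
    print(new_global["w"])   # the server broadcasts this back; personalized experts are never aggregated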
arXiv:2503.04629 (https://arxiv.org/abs/2503.04629) [pdf, other] cs.CL
SurveyForge: On the Outline Heuristics, Memory-Driven Generation, and Multi-dimensional Evaluation for Automated Survey Writing
Authors: Xiangchao Yan, Shiyang Feng, Jiakang Yuan, Renqiu Xia, Bin Wang, Bo Zhang, Lei Bai
Abstract: Survey papers play a crucial role in scientific research, especially given the rapid growth of research publications. Recently, researchers have begun using LLMs to automate survey generation for better efficiency. However, the quality gap between LLM-generated surveys and those written by humans remains significant, particularly in terms of outline quality and citation accuracy. To close these gaps, we introduce SurveyForge, which first generates the outline by analyzing the logical structure of human-written outlines and referring to retrieved domain-related articles. Subsequently, leveraging high-quality papers retrieved from memory by our scholar navigation agent, SurveyForge can automatically generate and refine the content of the generated article. Moreover, to achieve a comprehensive evaluation, we construct SurveyBench, which includes 100 human-written survey papers for win-rate comparison and assesses AI-generated survey papers across three dimensions: reference, outline, and content quality. Experiments demonstrate that SurveyForge can outperform previous works such as AutoSurvey.
Submitted 6 March, 2025; originally announced March 2025.
Comments: Code and dataset are available for downloading at: https://github.com/Alpha-Innovator/SurveyForge 22 pages, 10 figures

arXiv:2503.04625 (https://arxiv.org/abs/2503.04625) [pdf, other] cs.CL
START: Self-taught Reasoner with Tools
Authors: Chengpeng Li, Mingfeng Xue, Zhenru Zhang, Jiaxi Yang, Beichen Zhang, Xiang Wang, Bowen Yu, Binyuan Hui, Junyang Lin, Dayiheng Liu
Abstract: Large reasoning models (LRMs) like OpenAI-o1 and DeepSeek-R1 have demonstrated remarkable capabilities in complex reasoning tasks through the utilization of long chain-of-thought (CoT). However, these models often suffer from hallucinations and inefficiencies due to their reliance solely on internal reasoning processes. In this paper, we introduce START (Self-Taught Reasoner with Tools), a novel tool-integrated long-CoT reasoning LLM that significantly enhances reasoning capabilities by leveraging external tools. Through code execution, START is capable of performing complex computations, self-checking, exploring diverse methods, and self-debugging, thereby addressing the limitations of LRMs. The core innovation of START lies in its self-learning framework, which comprises two key techniques: 1) Hint-infer: we demonstrate that inserting artificially designed hints (e.g., "Wait, maybe using Python here is a good idea.") during the inference process of an LRM effectively stimulates its ability to utilize external tools without the need for any demonstration data. Hint-infer can also serve as a simple and effective sequential test-time scaling method. 2) Hint Rejection Sampling Fine-Tuning (Hint-RFT): Hint-RFT combines Hint-infer and RFT by scoring, filtering, and modifying the reasoning trajectories with tool invocation generated by an LRM via Hint-infer, followed by fine-tuning the LRM. Through this framework, we fine-tune the QwQ-32B model to obtain START. On PhD-level science QA (GPQA), competition-level math benchmarks (AMC23, AIME24, AIME25), and the competition-level code benchmark (LiveCodeBench), START achieves accuracy rates of 63.6%, 95.0%, 66.7%, 47.1%, and 47.3%, respectively. It significantly outperforms the base QwQ-32B and achieves performance comparable to the state-of-the-art open-weight model R1-Distill-Qwen-32B and the proprietary model o1-Preview.
Submitted 7 March, 2025; v1 submitted 6 March, 2025; originally announced March 2025.
Comments: 38 pages, 5 figures and 6 tables
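Note on START above: Hint-infer is described concretely enough to sketch: partway through decoding, a handcrafted hint is inserted to nudge the model toward emitting a tool call, and any produced code is executed and fed back into the trace. The snippet below mocks both the model call and the sandbox, since the real system wraps an LRM and a Python interpreter.

    HINT = "Wait, maybe using Python here is a good idea."

    def generate(prompt):
        # Stand-in for an LRM decoding call (a real system would query the model here).
        if HINT in prompt:
            return "```python\nprint(sum(range(1, 101)))\n```"
        return "Let me reason about the sum 1 + 2 + ... + 100 step by step..."

    def run_tool(code_block):
        # Stand-in for sandboxed code execution returning the tool output.
        return "5050"

    def hint_infer(question):
        draft = generate(question)                                     # 1) let the model reason on its own
        continued = generate(question + "\n" + draft + "\n" + HINT)    # 2) insert the hint and continue decoding
        if "```python" in continued:                                   # 3) execute any emitted code, append the result
            return continued + "\nTool output: " + run_tool(continued)
        return continued

    print(hint_infer("What is 1 + 2 + ... + 100?"))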
arXiv:2503.04548 (https://arxiv.org/abs/2503.04548) [pdf, other] cs.CL
An Empirical Study on Eliciting and Improving R1-like Reasoning Models
Authors: Zhipeng Chen, Yingqian Min, Beichen Zhang, Jie Chen, Jinhao Jiang, Daixuan Cheng, Wayne Xin Zhao, Zheng Liu, Xu Miao, Yang Lu, Lei Fang, Zhongyuan Wang, Ji-Rong Wen
Abstract: In this report, we present the third technical report on the development of slow-thinking models as part of the STILL project. As the technical pathway becomes clearer, scaling RL training has become a central technique for implementing such reasoning models. We systematically experiment with and document the effects of various factors influencing RL training, conducting experiments on both base models and fine-tuned models. Specifically, we demonstrate that our RL training approach consistently improves the Qwen2.5-32B base models, enhancing both response length and test accuracy. Furthermore, we show that even when a model like DeepSeek-R1-Distill-Qwen-1.5B has already achieved a high performance level, it can be further refined through RL training, reaching an accuracy of 39.33% on AIME 2024. Beyond RL training, we also explore the use of tool manipulation, finding that it significantly boosts the reasoning performance of large reasoning models. This approach achieves a remarkable accuracy of 86.67% with greedy search on AIME 2024, underscoring its effectiveness in enhancing model capabilities. We release our resources at the STILL project website: https://github.com/RUCAIBox/Slow_Thinking_with_LLMs.
Submitted 6 March, 2025; originally announced March 2025.
Comments: Technical Report on Slow Thinking with LLMs: Part III

arXiv:2503.04165 (https://arxiv.org/abs/2503.04165) [pdf, other] cs.CV
WeakSupCon: Weakly Supervised Contrastive Learning for Encoder Pre-training
Authors: Bodong Zhang, Hamid Manoochehri, Beatrice S. Knudsen, Tolga Tasdizen
Abstract: Weakly supervised multiple instance learning (MIL) is a challenging task given that only bag-level labels are provided, while each bag typically contains multiple instances. This topic has been extensively studied in histopathological image analysis, where labels are usually available only at the whole slide image (WSI) level, while each whole slide image can be divided into thousands of small image patches for training. The dominant MIL approaches take fixed patch features as inputs to address computational constraints and ensure model stability. These features are commonly generated by encoders pre-trained on ImageNet, foundation encoders pre-trained on large datasets, or through self-supervised learning on local datasets. While self-supervised encoder pre-training on the same dataset as the downstream MIL tasks helps mitigate domain shift and generate better features, the bag-level labels are not utilized during the process, and the features of patches from different categories may cluster together, reducing classification performance on MIL tasks. Recently, pre-training with supervised contrastive learning (SupCon) has demonstrated superior performance compared to self-supervised contrastive learning and even end-to-end training on traditional image classification tasks. In this paper, we propose a novel encoder pre-training method for downstream MIL tasks called Weakly Supervised Contrastive Learning (WeakSupCon) that utilizes bag-level labels. In our method, we employ multi-task learning and define distinct contrastive learning losses for samples with different bag labels. Our experiments demonstrate that the features generated using WeakSupCon significantly enhance MIL classification performance compared to self-supervised approaches across three datasets.
Submitted 6 March, 2025; originally announced March 2025.
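Note on WeakSupCon above: its key ingredient is letting bag-level labels shape the contrastive objective. The simplest picture of that is a SupCon-style loss in which patches sharing a bag label are treated as positives; the paper defines distinct per-label losses combined via multi-task learning, so the sketch below is only a baseline illustration.

    import torch
    import torch.nn.functional as F

    def supcon_loss(features, targets, tau=0.1):
        # SupCon-style loss: samples sharing a target are positives for each other.
        z = F.normalize(features, dim=1)
        n = len(z)
        logits = z @ z.T / tau - torch.eye(n) * 1e9           # mask self-similarity
        pos = (targets[:, None] == targets[None, :]).float() - torch.eye(n)
        log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
        return -((pos * log_prob).sum(1) / pos.sum(1).clamp(min=1)).mean()

    # Eight patch features from two bags; bag labels act as the contrastive targets.
    feats = torch.randn(8, 16, requires_grad=True)
    bag_labels = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
    loss = supcon_loss(feats, bag_labels)
    loss.backward()
    print(float(loss))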
arXiv:2503.03480 (https://arxiv.org/abs/2503.03480) [pdf, other] cs.RO, cs.AI
SafeVLA: Towards Safety Alignment of Vision-Language-Action Model via Safe Reinforcement Learning
Authors: Borong Zhang, Yuhao Zhang, Jiaming Ji, Yingshan Lei, Josef Dai, Yuanpei Chen, Yaodong Yang
Abstract: Vision-language-action models (VLAs) have shown great potential as generalist robot policies. However, these models pose urgent safety challenges during deployment, including the risk of physical harm to the environment, the robot itself, and humans. How can safety be explicitly incorporated into VLAs? In this work, we propose SafeVLA, a novel algorithm designed to integrate safety into VLAs, ensuring the protection of the environment, robot hardware, and humans in real-world settings. SafeVLA effectively balances safety and task performance by employing large-scale constrained learning within simulated environments. We demonstrate that SafeVLA outperforms the current state-of-the-art method in both safety and task performance, achieving average improvements of 83.58% and 3.85%, respectively, in simulation. By prioritizing safety, our approach eliminates high-risk behaviors and reduces the upper bound of unsafe behaviors to 1/35 of that of the current state-of-the-art, thereby significantly mitigating long-tail risks. Furthermore, the learned safety constraints generalize to diverse, unseen scenarios, including multiple out-of-distribution perturbations and tasks. Our data, models and newly proposed benchmark environment are available at https://sites.google.com/view/pku-safevla.
Submitted 5 March, 2025; originally announced March 2025.
Comments: 10 pages, 4 figures
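Note on SafeVLA above: "large-scale constrained learning" points to a constrained-RL formulation, i.e. maximize task reward subject to a cost budget. A common generic way to implement this is a Lagrangian relaxation with a learned multiplier; the toy loop below shows that mechanism on made-up numbers and is not SafeVLA's specific algorithm.

    # Constrained objective: maximize E[R] subject to E[C] <= cost_limit, relaxed as
    #   loss = -(E[R] - lambda * (E[C] - cost_limit)),
    # with lambda updated by gradient ascent on the constraint violation.
    cost_limit = 0.1
    lam, lam_lr = 0.0, 0.5

    rollouts = [(0.50, 0.40), (0.55, 0.30), (0.60, 0.15), (0.62, 0.09)]   # (avg reward, avg cost) per iteration
    for step, (avg_reward, avg_cost) in enumerate(rollouts):
        loss = -(avg_reward - lam * (avg_cost - cost_limit))    # would drive the policy update
        lam = max(0.0, lam + lam_lr * (avg_cost - cost_limit))  # multiplier grows while the policy is unsafe
        print(f"step {step}: loss={loss:.3f}, lambda={lam:.3f}")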
arXiv:2503.03114 (https://arxiv.org/abs/2503.03114) [pdf, other] cs.SE
PromAssistant: Leveraging Large Language Models for Text-to-PromQL
Authors: Chenxi Zhang, Bicheng Zhang, Dingyu Yang, Xin Peng, Miao Chen, Senyu Xie, Gang Chen, Wei Bi, Wei Li
Abstract: With the increasing complexity of modern online service systems, understanding the state and behavior of the systems is essential for ensuring their reliability and stability. Therefore, metric monitoring systems are widely used and have become an important infrastructure in online service systems. Engineers usually interact with metrics data by manually writing domain-specific language (DSL) queries to achieve various analysis objectives. However, writing these queries can be challenging and time-consuming, as it requires engineers to have strong programming skills and understand the context of the system. In this paper, we focus on PromQL, the metric query DSL provided by the widely used metric monitoring system Prometheus. We aim to simplify metrics querying by enabling engineers to interact with metrics data in Prometheus through natural language, and we call this task text-to-PromQL. Building upon this insight, this paper proposes PromAssistant, a Large Language Model-based text-to-PromQL framework. PromAssistant first uses a knowledge graph to describe the complex context of an online service system. Then, through the synergistic reasoning of LLMs and the knowledge graph, PromAssistant transforms engineers' natural language questions into PromQL queries. To evaluate PromAssistant, we manually construct the first text-to-PromQL benchmark dataset, which contains 280 metric query questions. The experiment results show that PromAssistant is effective for text-to-PromQL and outperforms baseline approaches. To the best of our knowledge, this paper is the first study of text-to-PromQL, and PromAssistant pioneers a DSL generation framework for metric querying and analysis.
Submitted 14 March, 2025; v1 submitted 4 March, 2025; originally announced March 2025.
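Note on PromAssistant above: the pipeline reduces to two steps that can be sketched: serialize system context from a knowledge graph into the prompt, then ask an LLM for a single PromQL query. In the sketch below the graph contents, the prompt wording, and the call_llm stub are all hypothetical; only the query in the stubbed answer is standard PromQL syntax.

    def serialize_kg(kg):
        # Flatten a tiny service knowledge graph into prompt-friendly lines.
        lines = []
        for service, info in kg.items():
            deps = ", ".join(info["depends_on"]) or "none"
            lines.append(f"service {service}: metrics={', '.join(info['metrics'])}; depends_on={deps}")
        return "\n".join(lines)

    def build_prompt(question, kg):
        return (
            "You translate natural-language questions about service metrics into PromQL.\n"
            "System context:\n" + serialize_kg(kg) + "\n"
            "Question: " + question + "\n"
            "Answer with a single PromQL query."
        )

    def call_llm(prompt):
        # Stub standing in for an actual LLM call.
        return 'sum(rate(http_requests_total{service="checkout"}[5m]))'

    kg = {
        "checkout": {"metrics": ["http_requests_total", "http_request_duration_seconds"], "depends_on": ["payments"]},
        "payments": {"metrics": ["http_requests_total"], "depends_on": []},
    }
    print(call_llm(build_prompt("What is the request rate of the checkout service over the last 5 minutes?", kg)))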
arXiv:2503.02851 (https://arxiv.org/abs/2503.02851) [pdf, other] cs.CL
Shakespearean Sparks: The Dance of Hallucination and Creativity in LLMs' Decoding Layers
Authors: Zicong He, Boxuan Zhang, Lu Cheng
Abstract: Large language models (LLMs) are known to hallucinate, a phenomenon often linked to creativity. While previous research has primarily explored this connection through theoretical or qualitative lenses, our work takes a quantitative approach to systematically examine the relationship between hallucination and creativity in LLMs. Given the complex nature of creativity, we propose a narrow definition tailored to LLMs and introduce an evaluation framework, HCL, which quantifies Hallucination and Creativity across different Layers of LLMs during decoding. Our empirical analysis reveals a tradeoff between hallucination and creativity that is consistent across layer depth, model type, and model size. Notably, across different model architectures, we identify a specific layer at each model size that optimally balances this tradeoff. Additionally, the optimal layer tends to appear in the early layers of larger models, and the confidence of the model is also significantly higher at this layer. These findings provide a quantitative perspective that offers new insights into the interplay between LLM creativity and hallucination. The code and data for our experiments are available at https://github.com/ZicongHe2002/HCL-Spark.
Submitted 4 March, 2025; originally announced March 2025.

arXiv:2503.01976 (https://arxiv.org/abs/2503.01976) [pdf, ps, other] cs.GT
Learning a Game by Paying the Agents
Authors: Brian Hu Zhang, Tao Lin, Yiling Chen, Tuomas Sandholm
Abstract: We study the problem of learning the utility functions of agents in a normal-form game by observing the agents play the game repeatedly. Differing from most prior literature, we introduce a principal with the power to observe the agents playing the game, send the agents signals, and send the agents payments as a function of their actions. Under reasonable behavioral models for the agents, such as iterated dominated action removal or a no-regret assumption, we show that the principal can, using a number of rounds polynomial in the size of the game, learn the utility functions of all agents to any desired precision $\varepsilon > 0$. We also show lower bounds in both models, which nearly match the upper bounds in the former model and also strictly separate the two models: the principal can learn strictly faster in the iterated dominance model. Finally, we discuss implications for the problem of steering agents to a desired equilibrium: in particular, using our utility-learning algorithm as a subroutine, we introduce the first algorithm for steering learning agents without prior knowledge of their utilities.
Submitted 3 March, 2025; originally announced March 2025.

arXiv:2503.01448 (https://arxiv.org/abs/2503.01448) [pdf, other] cs.CV
Generative Human Geometry Distribution
Authors: Xiangjun Tang, Biao Zhang, Peter Wonka
Abstract: Realistic human geometry generation is an important yet challenging task, requiring both the preservation of fine clothing details and the accurate modeling of clothing-pose interactions. Geometry distributions, which can model the geometry of a single human as a distribution, provide a promising representation for high-fidelity synthesis. However, applying geometry distributions to human generation requires learning a dataset-level distribution over numerous individual geometry distributions. To address the resulting challenges, we propose a novel 3D human generative framework that, for the first time, models the distribution of human geometry distributions. Our framework operates in two stages: first, generating the human geometry distribution, and second, synthesizing high-fidelity humans by sampling from this distribution. We validate our method on two tasks: pose-conditioned 3D human generation and single-view-based novel pose generation. Experimental results demonstrate that our approach achieves the best quantitative results in terms of realism and geometric fidelity, outperforming state-of-the-art generative methods.
Submitted 20 March, 2025; v1 submitted 3 March, 2025; originally announced March 2025.

arXiv:2503.01261 (https://arxiv.org/abs/2503.01261) [pdf, other] cs.CV
Towards Improved Text-Aligned Codebook Learning: Multi-Hierarchical Codebook-Text Alignment with Long Text
Authors: Guotao Liang, Baoquan Zhang, Zhiyuan Wen, Junteng Zhao, Yunming Ye, Kola Ye, Yao He
Abstract: Image quantization is a crucial technique in image generation, aimed at learning a codebook that encodes an image into a discrete token sequence. Recent advancements have seen researchers exploring the learning of a multi-modal codebook (i.e., a text-aligned codebook) by utilizing image caption semantics, aiming to enhance codebook performance in cross-modal tasks. However, existing image-text paired datasets exhibit a notable flaw in that the text descriptions tend to be overly concise, failing to adequately describe the images and provide sufficient semantic knowledge, which results in limited alignment of text and codebook at a fine-grained level. In this paper, we propose a novel Text-Augmented Codebook Learning framework, named TA-VQ, which generates longer text for each image using a visual-language model for improved text-aligned codebook learning. The long text, however, presents two key challenges: how to encode it and how to align it with the codebook. To tackle these two challenges, we propose to split the long text into multiple granularities for encoding, i.e., word, phrase, and sentence, so that the long text can be fully encoded without losing any key semantic knowledge. Following this, a hierarchical encoder and a novel sampling-based alignment strategy are designed to achieve fine-grained codebook-text alignment. Additionally, our method can be seamlessly integrated into existing VQ models. Extensive experiments on reconstruction and various downstream tasks demonstrate its effectiveness compared to previous state-of-the-art approaches.
Submitted 11 March, 2025; v1 submitted 3 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025

arXiv:2503.00848 (https://arxiv.org/abs/2503.00848) [pdf, other] cs.CV
PSRGS: Progressive Spectral Residual of 3D Gaussian for High-Frequency Recovery
Authors: BoCheng Li, WenJuan Zhang, Bing Zhang, YiLing Yao, YaNing Wang
Abstract: 3D Gaussian Splatting (3D GS) achieves impressive results in novel view synthesis for small, single-object scenes through Gaussian ellipsoid initialization and adaptive density control.
However, when applied to large-scale remote sensing scenes, 3D GS faces challenges: the point clouds generated by Structure-from-Motion (SfM) are often sparse, and the inherent smoothing behavior of 3D GS leads to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00848v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00848v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00848v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3D GS) achieves impressive results in novel view synthesis for small, single-object scenes through Gaussian ellipsoid initialization and adaptive density control. However, when applied to large-scale remote sensing scenes, 3D GS faces challenges: the point clouds generated by Structure-from-Motion (SfM) are often sparse, and the inherent smoothing behavior of 3D GS leads to over-reconstruction in high-frequency regions, where have detailed textures and color variations. This results in the generation of large, opaque Gaussian ellipsoids that cause gradient artifacts. Moreover, the simultaneous optimization of both geometry and texture may lead to densification of Gaussian ellipsoids at incorrect geometric locations, resulting in artifacts in other views. To address these issues, we propose PSRGS, a progressive optimization scheme based on spectral residual maps. Specifically, we create a spectral residual significance map to separate low-frequency and high-frequency regions. In the low-frequency region, we apply depth-aware and depth-smooth losses to initialize the scene geometry with low threshold. For the high-frequency region, we use gradient features with higher threshold to split and clone ellipsoids, refining the scene. The sampling rate is determined by feature responses and gradient loss. Finally, we introduce a pre-trained network that jointly computes perceptual loss from multiple views, ensuring accurate restoration of high-frequency details in both Gaussian ellipsoids geometry and color. We conduct experiments on multiple datasets to assess the effectiveness of our method, which demonstrates competitive rendering quality, especially in recovering texture details in high-frequency regions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00848v1-abstract-full').style.display = 'none'; document.getElementById('2503.00848v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
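The abstract does not spell out how the spectral residual significance map is built. As a rough illustration of the general idea it alludes to, the sketch below computes a classic spectral-residual saliency map (Hou and Zhang, 2007) with NumPy/SciPy and thresholds it into low- and high-frequency regions; the kernel size, smoothing sigma, and 0.5 threshold are placeholder values, not choices taken from the paper.

```python
# Minimal sketch of a spectral-residual saliency map in the spirit of the classic
# spectral residual method (Hou & Zhang, 2007). It illustrates the generic idea of
# separating low- and high-frequency regions; it is not the PSRGS implementation.
import numpy as np
from scipy.ndimage import gaussian_filter, uniform_filter

def spectral_residual_map(gray: np.ndarray, avg_size: int = 3, sigma: float = 2.5) -> np.ndarray:
    """Return a [0, 1] saliency map that is large in high-frequency regions."""
    spectrum = np.fft.fft2(gray)
    log_amp = np.log1p(np.abs(spectrum))
    phase = np.angle(spectrum)
    # Spectral residual: log amplitude minus its local average.
    residual = log_amp - uniform_filter(log_amp, size=avg_size)
    # Recombine with the original phase and return to image space.
    saliency = np.abs(np.fft.ifft2(np.exp(residual + 1j * phase))) ** 2
    saliency = gaussian_filter(saliency, sigma=sigma)
    return (saliency - saliency.min()) / (saliency.max() - saliency.min() + 1e-8)

gray = np.random.rand(128, 128)                      # placeholder grayscale image
high_freq_mask = spectral_residual_map(gray) > 0.5   # hypothetical threshold
```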
arXiv:2503.00485 [cs.LG] (https://arxiv.org/abs/2503.00485)
Homomorphism Expressivity of Spectral Invariant Graph Neural Networks
Authors: Jingchu Gai, Yiheng Du, Bohang Zhang, Haggai Maron, Liwei Wang
Abstract: Graph spectra are an important class of structural features on graphs that have shown promising results in enhancing Graph Neural Networks (GNNs). Despite their widespread practical use, the theoretical understanding of the power of spectral invariants -- particularly their contribution to GNNs -- remains incomplete. In this paper, we address this fundamental question through the lens of homomorphism expressivity, providing a comprehensive and quantitative analysis of the expressive power of spectral invariants. Specifically, we prove that spectral invariant GNNs can homomorphism-count exactly a specific class of tree-like graphs, which we refer to as parallel trees. We highlight the significance of this result in various contexts, including establishing a quantitative expressiveness hierarchy across different architectural variants, offering insights into the impact of GNN depth, and understanding the subgraph counting capabilities of spectral invariant GNNs. In particular, our results significantly extend Arvind et al. (2024) and settle their open questions. Finally, we generalize our analysis to higher-order GNNs and answer an open question raised by Zhang et al. (2024).
Submitted 1 March, 2025; originally announced March 2025.
Comments: 42 pages
Journal ref: ICLR 2025 Oral

arXiv:2502.21134 [cs.RO, cs.AI] (https://arxiv.org/abs/2502.21134)
Dynamically Local-Enhancement Planner for Large-Scale Autonomous Driving
Authors: Nanshan Deng, Weitao Zhou, Bo Zhang, Junze Wen, Kun Jiang, Zhong Cao, Diange Yang
Abstract: Current autonomous vehicles operate primarily within limited regions, but there is increasing demand for broader applications. However, as models scale, their limited capacity becomes a significant challenge for adapting to novel scenarios. It is increasingly difficult to improve models for new situations using a single monolithic model. To address this issue, we introduce the concept of dynamically enhancing a basic driving planner with local driving data, without permanently modifying the planner itself. This approach, termed the Dynamically Local-Enhancement (DLE) Planner, aims to improve the scalability of autonomous driving systems without significantly expanding the planner's size. Our approach introduces a position-varying Markov Decision Process formulation coupled with a graph neural network that extracts region-specific driving features from local observation data. The learned features describe the local behavior of surrounding objects and are then leveraged to enhance a basic reinforcement-learning-based policy. We evaluated our approach in multiple scenarios and compared it with a one-for-all driving model. The results show that our method outperforms the baseline policy in both safety (collision rate) and average reward, while maintaining a lighter scale. This approach has the potential to benefit large-scale autonomous vehicles without the need to greatly expand on-device driving models.
Submitted 28 February, 2025; originally announced February 2025.

arXiv:2502.20367 [cs.RO] (https://arxiv.org/abs/2502.20367)
The Role of Tactile Sensing for Learning Reach and Grasp
Authors: Boya Zhang, Iris Andrussow, Andreas Zell, Georg Martius
Abstract: Stable and robust robotic grasping is essential for current and future robot applications. In recent works, the use of large datasets and supervised learning has enhanced speed and precision in antipodal grasping. However, these methods struggle with perception and calibration errors due to large planning horizons. To obtain more robust and reactive grasping motions, leveraging reinforcement learning combined with tactile sensing is a promising direction. Yet, there is no systematic evaluation of how the complexity of force-based tactile sensing affects the learning behavior for grasping tasks. This paper compares various tactile and environmental setups using two model-free reinforcement learning approaches for antipodal grasping. Our findings suggest that under imperfect visual perception, various tactile features improve learning outcomes, while complex tactile inputs complicate training.
Submitted 27 February, 2025; originally announced February 2025.

arXiv:2502.19568 [cs.LG, cs.CV, eess.IV] (https://arxiv.org/abs/2502.19568)
PhenoProfiler: Advancing Phenotypic Learning for Image-based Drug Discovery
Authors: Bo Li, Bob Zhang, Chengyang Zhang, Minghao Zhou, Weiliang Huang, Shihang Wang, Qing Wang, Mengran Li, Yong Zhang, Qianqian Song
Abstract: In the field of image-based drug discovery, capturing the phenotypic response of cells to various drug treatments and perturbations is a crucial step. However, existing methods require computationally extensive and complex multi-step procedures, which can introduce inefficiencies, limit generalizability, and increase potential errors. To address these challenges, we present PhenoProfiler, an innovative model designed to efficiently and effectively extract morphological representations, enabling the elucidation of phenotypic changes induced by treatments. PhenoProfiler is designed as an end-to-end tool that processes whole-slide multi-channel images directly into low-dimensional quantitative representations, eliminating the extensive computational steps required by existing methods. It also includes a multi-objective learning module to enhance robustness, accuracy, and generalization in morphological representation learning. PhenoProfiler is rigorously evaluated on large-scale publicly available datasets, including over 230,000 whole-slide multi-channel images in end-to-end scenarios and more than 8.42 million single-cell images in non-end-to-end settings. Across these benchmarks, PhenoProfiler consistently outperforms state-of-the-art methods by up to 20%, demonstrating substantial improvements in both accuracy and robustness. Furthermore, PhenoProfiler uses a tailored phenotype correction strategy to emphasize relative phenotypic changes under treatments, facilitating the detection of biologically meaningful signals. UMAP visualizations of treatment profiles demonstrate PhenoProfiler's ability to effectively cluster treatments with similar biological annotations, thereby enhancing interpretability. These findings establish PhenoProfiler as a scalable, generalizable, and robust tool for phenotypic learning.
Submitted 26 February, 2025; originally announced February 2025.
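The "phenotype correction strategy" above is only named, not described. In image-based profiling pipelines, emphasizing relative phenotypic changes typically means normalizing each treatment profile against negative controls; the snippet below is a generic, assumed illustration of that kind of step, with made-up function and column names, and should not be read as PhenoProfiler's actual procedure.

```python
# Generic illustration of control-based profile normalization, a common way to
# emphasize relative phenotypic changes under treatments in image-based profiling.
# Assumed for illustration only; not PhenoProfiler's actual correction strategy.
import numpy as np
import pandas as pd

def normalize_to_controls(profiles: pd.DataFrame, is_control: pd.Series) -> pd.DataFrame:
    """Center and scale each feature by the negative-control statistics."""
    ctrl = profiles[is_control]
    return (profiles - ctrl.mean()) / (ctrl.std(ddof=0) + 1e-8)

# Hypothetical usage: rows are wells, columns are learned morphological features.
profiles = pd.DataFrame(np.random.randn(8, 4), columns=[f"feat_{i}" for i in range(4)])
is_control = pd.Series([True, True, False, False, False, False, False, False])
relative_profiles = normalize_to_controls(profiles, is_control)
```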
arXiv:2502.18924 [eess.AS, cs.LG, cs.SD] (https://arxiv.org/abs/2502.18924)
Sparse Alignment Enhanced Latent Diffusion Transformer for Zero-Shot Speech Synthesis
Authors: Ziyue Jiang, Yi Ren, Ruiqi Li, Shengpeng Ji, Boyang Zhang, Zhenhui Ye, Chen Zhang, Bai Jionghao, Xiaoda Yang, Jialong Zuo, Yu Zhang, Rui Liu, Xiang Yin, Zhou Zhao
Abstract: While recent zero-shot text-to-speech (TTS) models have significantly improved speech quality and expressiveness, mainstream systems still suffer from issues related to speech-text alignment modeling: 1) models without explicit speech-text alignment modeling exhibit less robustness, especially for hard sentences in practical applications; 2) predefined alignment-based models suffer from naturalness constraints of forced alignments. This paper introduces MegaTTS 3, a TTS system featuring an innovative sparse alignment algorithm that guides the latent diffusion transformer (DiT). Specifically, we provide sparse alignment boundaries to MegaTTS 3 to reduce the difficulty of alignment without limiting the search space, thereby achieving high naturalness. Moreover, we employ a multi-condition classifier-free guidance strategy for accent intensity adjustment and adopt the piecewise rectified flow technique to accelerate the generation process. Experiments demonstrate that MegaTTS 3 achieves state-of-the-art zero-shot TTS speech quality and supports highly flexible control over accent intensity. Notably, our system can generate high-quality one-minute speech with only 8 sampling steps. Audio samples are available at https://sditdemo.github.io/sditdemo/.
Submitted 27 March, 2025; v1 submitted 26 February, 2025; originally announced February 2025.

arXiv:2502.18858 [cs.AI, cs.CL, cs.CV, cs.IR, cs.LG] (https://arxiv.org/abs/2502.18858)
Evaluating Intelligence via Trial and Error
Authors: Jingtao Zhan, Jiahao Zhao, Jiayu Li, Yiqun Liu, Bo Zhang, Qingyao Ai, Jiaxin Mao, Hongning Wang, Min Zhang, Shaoping Ma
Abstract: Intelligence is a crucial trait for species to find solutions within a limited number of trial-and-error attempts. Building on this idea, we introduce Survival Game as a framework to evaluate intelligence based on the number of failed attempts in a trial-and-error process. Fewer failures indicate higher intelligence. When the expectation and variance of failure counts are both finite, it signals the ability to consistently find solutions to new challenges, which we define as the Autonomous Level of intelligence. Using Survival Game, we comprehensively evaluate existing AI systems. Our results show that while AI systems achieve the Autonomous Level in simple tasks, they are still far from it in more complex tasks, such as vision, search, recommendation, and language. While scaling current AI technologies might help, this would come at an astronomical cost. Projections suggest that achieving the Autonomous Level for general tasks would require $10^{26}$ parameters. To put this into perspective, loading such a massive model requires so many H100 GPUs that their total value is $10^{7}$ times that of Apple Inc.'s market value. Even with Moore's Law, supporting such a parameter scale would take $70$ years. This staggering cost highlights the complexity of human tasks and the inadequacies of current AI technologies. To further investigate this phenomenon, we conduct a theoretical analysis of Survival Game and its experimental results. Our findings suggest that human tasks possess a criticality property. As a result, the Autonomous Level requires a deep understanding of the task's underlying mechanisms. Current AI systems, however, do not fully grasp these mechanisms and instead rely on superficial mimicry, making it difficult for them to reach the Autonomous Level. We believe Survival Game can not only guide the future development of AI but also offer profound insights into human intelligence.
Submitted 3 March, 2025; v1 submitted 26 February, 2025; originally announced February 2025.
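The Autonomous Level criterion above hinges on the failure-count statistics being finite. As a toy illustration (not the paper's benchmark or agents), the snippet below counts failures before the first success for an agent whose attempts succeed independently with probability p; in that stand-in model the mean and variance are the finite geometric-distribution values (1-p)/p and (1-p)/p^2.

```python
# Toy illustration of the failure-count statistic behind the Survival Game idea:
# count failed attempts before the first success and check that the empirical
# mean and variance look finite. The i.i.d. success model is an assumption made
# only for this example.
import numpy as np

rng = np.random.default_rng(0)

def failures_before_success(p: float) -> int:
    """Number of failed attempts before the first success."""
    failures = 0
    while rng.random() >= p:
        failures += 1
    return failures

p = 0.2                                               # hypothetical per-attempt success rate
samples = np.array([failures_before_success(p) for _ in range(10_000)])
print(samples.mean(), samples.var())                  # close to (1-p)/p = 4 and (1-p)/p**2 = 20
```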
arXiv:2502.18605 [cs.GT, cs.LG, math.OC] (https://arxiv.org/abs/2502.18605)
Expected Variational Inequalities
Authors: Brian Hu Zhang, Ioannis Anagnostides, Emanuel Tewolde, Ratip Emin Berker, Gabriele Farina, Vincent Conitzer, Tuomas Sandholm
Abstract: Variational inequalities (VIs) encompass many fundamental problems in diverse areas ranging from engineering to economics and machine learning. However, their considerable expressivity comes at the cost of computational intractability. In this paper, we introduce and analyze a natural relaxation -- which we refer to as expected variational inequalities (EVIs) -- where the goal is to find a distribution that satisfies the VI constraint in expectation. By adapting recent techniques from game theory, we show that, unlike VIs, EVIs can be solved in polynomial time under general (nonmonotone) operators. EVIs capture the seminal notion of correlated equilibria, but enjoy a greater reach beyond games. We also employ our framework to capture and generalize several existing disparate results, including from settings such as smooth games, and games with coupled constraints or nonconcave utilities.
Submitted 27 February, 2025; v1 submitted 25 February, 2025; originally announced February 2025.
Comments: V2 expands on the related work

arXiv:2502.18582 [stat.ML, cs.GT, cs.LG] (https://arxiv.org/abs/2502.18582)
Learning and Computation of $\Phi$-Equilibria at the Frontier of Tractability
Authors: Brian Hu Zhang, Ioannis Anagnostides, Emanuel Tewolde, Ratip Emin Berker, Gabriele Farina, Vincent Conitzer, Tuomas Sandholm
Abstract: $\Phi$-equilibria -- and the associated notion of $\Phi$-regret -- are a powerful and flexible framework at the heart of online learning and game theory, whereby enriching the set of deviations $\Phi$ begets stronger notions of rationality. Recently, Daskalakis, Farina, Fishelson, Pipis, and Schneider (STOC '24) -- abbreviated as DFFPS -- settled the existence of efficient algorithms when $\Phi$ contains only linear maps under a general, $d$-dimensional convex constraint set $\mathcal{X}$. In this paper, we significantly extend their work by resolving the case where $\Phi$ is $k$-dimensional; degree-$\ell$ polynomials constitute a canonical such example with $k = d^{O(\ell)}$. In particular, positing only oracle access to $\mathcal{X}$, we obtain two main positive results: i) a $\text{poly}(n, d, k, \log(1/\varepsilon))$-time algorithm for computing $\varepsilon$-approximate $\Phi$-equilibria in $n$-player multilinear games, and ii) an efficient online algorithm that incurs average $\Phi$-regret at most $\varepsilon$ using $\text{poly}(d, k)/\varepsilon^2$ rounds. We also show nearly matching lower bounds in the online learning setting, thereby obtaining for the first time a family of deviations that captures the learnability of $\Phi$-regret. From a technical standpoint, we extend the framework of DFFPS from linear maps to the more challenging case of maps with polynomial dimension. At the heart of our approach is a polynomial-time algorithm for computing an expected fixed point of any $\phi: \mathcal{X} \to \mathcal{X}$ based on the ellipsoid against hope (EAH) algorithm of Papadimitriou and Roughgarden (JACM '08). In particular, our algorithm for computing $\Phi$-equilibria is based on executing EAH in a nested fashion -- each step of EAH itself being implemented by invoking a separate call to EAH.
Submitted 27 February, 2025; v1 submitted 25 February, 2025; originally announced February 2025.
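For orientation, the standard generic definitions behind the terms used above are roughly the following; the paper's exact formalization may differ in details such as time-varying utilities or per-player deviation sets.

```latex
% Generic Phi-regret of plays x_1,\dots,x_T \in \mathcal{X} against utilities u_1,\dots,u_T:
\mathrm{Reg}_\Phi(T) \;=\; \max_{\phi \in \Phi} \sum_{t=1}^{T}
    \bigl( u_t(\phi(x_t)) - u_t(x_t) \bigr).
% A distribution \mu over strategy profiles is an \varepsilon-approximate \Phi-equilibrium
% if no player i gains more than \varepsilon in expectation by applying some deviation map
% \phi \in \Phi_i to its own component:
\mathbb{E}_{x \sim \mu}\!\left[ u_i\bigl(\phi(x_i), x_{-i}\bigr) - u_i(x) \right] \;\le\; \varepsilon
    \qquad \text{for all players } i \text{ and all } \phi \in \Phi_i.
```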
arXiv:2502.18527 [cs.CR, cs.AI, cs.LG] (https://arxiv.org/abs/2502.18527)
GOD model: Privacy Preserved AI School for Personal Assistant
Authors: PIN AI Team, Bill Sun, Gavin Guo, Regan Peng, Boliang Zhang, Shouqiao Wang, Laura Florescu, Xi Wang, Davide Crapis, Ben Wu
Abstract: Personal AI assistants (e.g., Apple Intelligence, Meta AI) offer proactive recommendations that simplify everyday tasks, but their reliance on sensitive user data raises concerns about privacy and trust. To address these challenges, we introduce the Guardian of Data (GOD), a secure, privacy-preserving framework for training and evaluating AI assistants directly on-device. Unlike traditional benchmarks, the GOD model measures how well assistants can anticipate user needs, such as suggesting gifts, while protecting user data and autonomy. Functioning like an AI school, it addresses the cold start problem by simulating user queries and employing a curriculum-based approach to refine the performance of each assistant. Running within a Trusted Execution Environment (TEE), it safeguards user data while applying reinforcement and imitation learning to refine AI recommendations. A token-based incentive system encourages users to share data securely, creating a data flywheel that drives continuous improvement. Specifically, users mine with their data, and the mining rate is determined by GOD's evaluation of how well their AI assistant understands them across categories such as shopping, social interactions, productivity, trading, and Web3. By integrating privacy, personalization, and trust, the GOD model provides a scalable, responsible path for advancing personal AI assistants. For community collaboration, part of the framework is open-sourced at https://github.com/PIN-AI/God-Model.
Submitted 27 February, 2025; v1 submitted 24 February, 2025; originally announced February 2025.
class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>