Search | arXiv e-print repository
Showing 1–50 of 2,043 results for author: Wang, B

Searching in archive cs (all-archive search: https://arxiv.org/search/?searchtype=author&query=Wang%2C+B). Sorted by announcement date (newest first), 50 results per page.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10345">arXiv:2503.10345</a> <span> [<a href="https://arxiv.org/pdf/2503.10345">pdf</a>, <a href="https://arxiv.org/format/2503.10345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Mirror Online Conformal Prediction with Intermittent Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bowen Wang</a>, <a href="/search/cs?searchtype=author&query=Zecchin%2C+M">Matteo Zecchin</a>, <a href="/search/cs?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10345v1-abstract-short" style="display: inline;"> Online conformal prediction enables the runtime calibration of a pre-trained artificial intelligence model using feedback on its performance. Calibration is achieved through set predictions that are updated via online rules so as to ensure long-term coverage guarantees. 
While recent research has demonstrated the benefits of incorporating prior knowledge into the calibration process, this has come… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10345v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10345v1-abstract-full" style="display: none;"> Online conformal prediction enables the runtime calibration of a pre-trained artificial intelligence model using feedback on its performance. Calibration is achieved through set predictions that are updated via online rules so as to ensure long-term coverage guarantees. While recent research has demonstrated the benefits of incorporating prior knowledge into the calibration process, this has come at the cost of replacing coverage guarantees with less tangible regret guarantees based on the quantile loss. This work introduces intermittent mirror online conformal prediction (IM-OCP), a novel runtime calibration framework that integrates prior knowledge, while maintaining long-term coverage and achieving sub-linear regret. IM-OCP features closed-form updates with minimal memory complexity, and is designed to operate under potentially intermittent feedback. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10345v1-abstract-full').style.display = 'none'; document.getElementById('2503.10345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10170">arXiv:2503.10170</a> <span> [<a href="https://arxiv.org/pdf/2503.10170">pdf</a>, <a href="https://arxiv.org/format/2503.10170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GS-SDF: LiDAR-Augmented Gaussian Splatting and Neural SDF for Geometrically Consistent Rendering and Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianheng Liu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Y">Yunfei Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bowen Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chunran Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiarong Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10170v1-abstract-short" style="display: inline;"> Digital twins are fundamental to the development of autonomous driving and embodied artificial intelligence. However, achieving high-granularity surface reconstruction and high-fidelity rendering remains a challenge. 
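   The abstract does not spell out IM-OCP's update rule, so the sketch below shows only the generic online conformal update this line of work builds on: a nonconformity-score threshold nudged up on miscoverage and down on coverage, with the feedback call optionally skipped to mimic intermittent feedback. All names and the step rule are illustrative, not the paper's algorithm.

   ```python
   class OnlineConformal:
       """Illustrative baseline online conformal predictor (not IM-OCP)."""

       def __init__(self, alpha: float = 0.1, lr: float = 0.05):
           self.alpha = alpha  # target long-run miscoverage rate
           self.lr = lr        # step size of the online update
           self.q = 1.0        # current nonconformity-score threshold

       def predict_set(self, scores: dict) -> set:
           """Prediction set: all labels whose score falls below the threshold."""
           return {label for label, s in scores.items() if s <= self.q}

       def update(self, true_label_score: float) -> None:
           """Feedback step; skipping calls models intermittent feedback."""
           err = 1.0 if true_label_score > self.q else 0.0  # 1 = miscovered
           # Raise the threshold (larger sets) after miscoverage, shrink otherwise,
           # so empirical miscoverage tracks alpha in the long run.
           self.q += self.lr * (err - self.alpha)

   ocp = OnlineConformal()
   pred = ocp.predict_set({"cat": 0.2, "dog": 1.4})
   ocp.update(true_label_score=0.2)  # call only when feedback actually arrives
   ```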
2. arXiv:2503.10170 [pdf, other] — https://arxiv.org/abs/2503.10170
   Subjects: cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
   Title: GS-SDF: LiDAR-Augmented Gaussian Splatting and Neural SDF for Geometrically Consistent Rendering and Reconstruction
   Authors: Jianheng Liu, Yunfei Wan, Bowen Wang, Chunran Zheng, Jiarong Lin, Fu Zhang
   Abstract: Digital twins are fundamental to the development of autonomous driving and embodied artificial intelligence. However, achieving high-granularity surface reconstruction and high-fidelity rendering remains a challenge. Gaussian splatting offers efficient photorealistic rendering but struggles with geometric inconsistencies due to fragmented primitives and sparse observational data in robotics applications. Existing regularization methods, which rely on render-derived constraints, often fail in complex environments. Moreover, effectively integrating sparse LiDAR data with Gaussian splatting remains challenging. We propose a unified LiDAR-visual system that synergizes Gaussian splatting with a neural signed distance field. The accurate LiDAR point clouds enable a trained neural signed distance field to offer a manifold geometry field. This motivates us to offer an SDF-based Gaussian initialization for physically grounded primitive placement and a comprehensive geometric regularization for geometrically consistent rendering and reconstruction. Experiments demonstrate superior reconstruction accuracy and rendering quality across diverse trajectories. To benefit the community, the codes will be released at https://github.com/hku-mars/GS-SDF.
   Submitted 13 March, 2025; originally announced March 2025.
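   The abstract names an SDF-based initialization and a geometric regularization but gives no formulas. One plausible reading, shown purely as an assumption, is a loss that pulls Gaussian centers onto the zero level set of the learned SDF; here an analytic sphere SDF stands in for the trained neural field.

   ```python
   import torch

   def sdf_sphere(points: torch.Tensor, radius: float = 1.0) -> torch.Tensor:
       """Stand-in for the trained neural SDF: signed distance to a sphere."""
       return points.norm(dim=-1) - radius

   def surface_regularizer(centers: torch.Tensor, sdf) -> torch.Tensor:
       """Hypothetical geometric regularizer: squared signed distance at each
       Gaussian center, zero exactly when primitives sit on the surface."""
       return (sdf(centers) ** 2).mean()

   # Toy run: random Gaussian centers drift onto the sphere's zero level set.
   centers = torch.randn(512, 3, requires_grad=True)
   optim = torch.optim.Adam([centers], lr=1e-2)
   for _ in range(200):
       optim.zero_grad()
       loss = surface_regularizer(centers, sdf_sphere)
       loss.backward()
       optim.step()
   ```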
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09642">arXiv:2503.09642</a> <span> [<a href="https://arxiv.org/pdf/2503.09642">pdf</a>, <a href="https://arxiv.org/format/2503.09642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Open-Sora 2.0: Training a Commercial-Level Video Generation Model in $200k </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiangyu Peng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zangwei Zheng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chenhui Shen</a>, <a href="/search/cs?searchtype=author&query=Young%2C+T">Tom Young</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xinying Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Binluo Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongxin Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Mingyan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhui Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+A">Anbang Ye</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+G">Gang Ren</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qianran Ma</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+W">Wanying Liang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+X">Xiang Lian</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiwen Wu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yuting Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuangyan Li</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+C">Chaoyu Gong</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+G">Guojun Lei</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Leijun Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Limin Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Minghao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruijie Zhang</a> , et al. (7 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09642v1-abstract-short" style="display: inline;"> Video generation models have achieved remarkable progress in the past year. The quality of AI video continues to improve, but at the cost of larger model size, increased data quantity, and greater demand for training compute. In this report, we present Open-Sora 2.0, a commercial-level video generation model trained for only $200k. 
With this model, we demonstrate that the cost of training a top-pe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09642v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09642v1-abstract-full" style="display: none;"> Video generation models have achieved remarkable progress in the past year. The quality of AI video continues to improve, but at the cost of larger model size, increased data quantity, and greater demand for training compute. In this report, we present Open-Sora 2.0, a commercial-level video generation model trained for only $200k. With this model, we demonstrate that the cost of training a top-performing video generation model is highly controllable. We detail all techniques that contribute to this efficiency breakthrough, including data curation, model architecture, training strategy, and system optimization. According to human evaluation results and VBench scores, Open-Sora 2.0 is comparable to global leading video generation models including the open-source HunyuanVideo and the closed-source Runway Gen-3 Alpha. By making Open-Sora 2.0 fully open-source, we aim to democratize access to advanced video generation technology, fostering broader innovation and creativity in content creation. All resources are publicly available at: https://github.com/hpcaitech/Open-Sora. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09642v1-abstract-full').style.display = 'none'; document.getElementById('2503.09642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08317">arXiv:2503.08317</a> <span> [<a href="https://arxiv.org/pdf/2503.08317">pdf</a>, <a href="https://arxiv.org/format/2503.08317">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Uni-Gaussians: Unifying Camera and Lidar Simulation with Gaussians for Dynamic Driving Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zikang Yuan</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yuechuan Pu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Hongcheng Luo</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+F">Fengtian Lang</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+C">Cheng Chi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Teng Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yingying Shen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haiyang Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bing Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08317v1-abstract-short" style="display: inline;"> Ensuring the safety of autonomous vehicles necessitates comprehensive simulation of multi-sensor data, encompassing inputs from both cameras and LiDAR sensors, across various dynamic driving scenarios. Neural rendering techniques, which utilize collected raw sensor data to simulate these dynamic environments, have emerged as a leading methodology. While NeRF-based approaches can uniformly represen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08317v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08317v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08317v1-abstract-full" style="display: none;"> Ensuring the safety of autonomous vehicles necessitates comprehensive simulation of multi-sensor data, encompassing inputs from both cameras and LiDAR sensors, across various dynamic driving scenarios. Neural rendering techniques, which utilize collected raw sensor data to simulate these dynamic environments, have emerged as a leading methodology. While NeRF-based approaches can uniformly represent scenes for rendering data from both camera and LiDAR, they are hindered by slow rendering speeds due to dense sampling. Conversely, Gaussian Splatting-based methods employ Gaussian primitives for scene representation and achieve rapid rendering through rasterization. However, these rasterization-based techniques struggle to accurately model non-linear optical sensors. This limitation restricts their applicability to sensors beyond pinhole cameras. To address these challenges and enable unified representation of dynamic driving scenarios using Gaussian primitives, this study proposes a novel hybrid approach. 
Our method utilizes rasterization for rendering image data while employing Gaussian ray-tracing for LiDAR data rendering. Experimental results on public datasets demonstrate that our approach outperforms current state-of-the-art methods. This work presents a unified and efficient solution for realistic simulation of camera and LiDAR data in autonomous driving scenarios using Gaussian primitives, offering significant advancements in both rendering quality and computational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08317v1-abstract-full').style.display = 'none'; document.getElementById('2503.08317v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08292">arXiv:2503.08292</a> <span> [<a href="https://arxiv.org/pdf/2503.08292">pdf</a>, <a href="https://arxiv.org/format/2503.08292">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Outpatient Referral: Problem Definition, Benchmarking and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoxiao Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Q">Qingying Xiao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junying Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xiangyi Feng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiangbo Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bairui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+X">Xiang Wan</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+J">Jian Chang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+G">Guangjun Yu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yan Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08292v1-abstract-short" style="display: inline;"> Large language models (LLMs) are increasingly applied to outpatient referral tasks across healthcare systems. However, there is a lack of standardized evaluation criteria to assess their effectiveness, particularly in dynamic, interactive scenarios. 
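   The hybrid design rests on the fact that pinhole cameras map cleanly to rasterization while a spinning LiDAR does not. The toy functions below, written as an illustration rather than the paper's code, contrast the two ray models: pixel-linear pinhole rays versus azimuth/elevation LiDAR rays, which no single pinhole projection can reproduce.

   ```python
   import numpy as np

   def pinhole_rays(width: int, height: int, focal: float) -> np.ndarray:
       """Pinhole camera rays: directions are linear in pixel coordinates,
       which is what makes splat rasterization applicable."""
       u, v = np.meshgrid(np.arange(width), np.arange(height))
       d = np.stack([(u - width / 2) / focal, (v - height / 2) / focal,
                     np.ones_like(u, dtype=float)], axis=-1)
       return d / np.linalg.norm(d, axis=-1, keepdims=True)

   def spinning_lidar_rays(azimuths_deg, elevations_deg) -> np.ndarray:
       """Spinning LiDAR rays on an azimuth/elevation grid: a non-pinhole
       projection, hence the Gaussian ray-tracing path for LiDAR rendering."""
       az, el = np.meshgrid(np.deg2rad(azimuths_deg), np.deg2rad(elevations_deg))
       return np.stack([np.cos(el) * np.cos(az),
                        np.cos(el) * np.sin(az),
                        np.sin(el)], axis=-1)
   ```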
5. arXiv:2503.08292 [pdf, other] — https://arxiv.org/abs/2503.08292
   Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
   Title: Large Language Models for Outpatient Referral: Problem Definition, Benchmarking and Challenges
   Authors: Xiaoxiao Liu, Qingying Xiao, Junying Chen, Xiangyi Feng, Xiangbo Wu, Bairui Zhang, Xiang Wan, Jian Chang, Guangjun Yu, Yan Hu, Benyou Wang
   Abstract: Large language models (LLMs) are increasingly applied to outpatient referral tasks across healthcare systems. However, there is a lack of standardized evaluation criteria to assess their effectiveness, particularly in dynamic, interactive scenarios. In this study, we systematically examine the capabilities and limitations of LLMs in managing tasks within Intelligent Outpatient Referral (IOR) systems and propose a comprehensive evaluation framework specifically designed for such systems. This framework comprises two core tasks: static evaluation, which assesses the ability to make predefined outpatient referrals, and dynamic evaluation, which assesses the ability to refine outpatient referral recommendations through iterative dialogues. Our findings suggest that LLMs offer limited advantages over BERT-like models, but show promise in asking effective questions during interactive dialogues.
   Submitted 11 March, 2025; originally announced March 2025.
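   To make the two evaluation modes concrete, here is a minimal harness sketch under assumed interfaces: a `model` that takes a dialogue (list of utterances) and returns a department plus an optional follow-up question, and a simulated `patient`. None of these names come from the paper.

   ```python
   def static_eval(model, cases, gold_departments) -> float:
       """Single-shot referral accuracy over predefined cases."""
       hits = sum(model([case])[0] == gold
                  for case, gold in zip(cases, gold_departments))
       return hits / len(cases)

   def dynamic_eval(model, patient, gold_department, max_turns: int = 5) -> bool:
       """Multi-turn referral: the model may ask questions before deciding."""
       dialogue = [patient.initial_complaint()]
       department = None
       for _ in range(max_turns):
           department, question = model(dialogue)
           if question is None:  # model is confident enough to refer
               break
           dialogue.append(patient.answer(question))  # simulated patient reply
       return department == gold_department
   ```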
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08162">arXiv:2503.08162</a> <span> [<a href="https://arxiv.org/pdf/2503.08162">pdf</a>, <a href="https://arxiv.org/format/2503.08162">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FASIONAD++ : Integrating High-Level Instruction and Information Bottleneck in FAt-Slow fusION Systems for Enhanced Safety in Autonomous Driving with Adaptive Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qian%2C+K">Kangan Qian</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Ziang Luo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Sicong Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zilin Huang</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+J">Jinyu Miao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhikun Ma</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+T">Tianze Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiayin Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zheng Fu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yining Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyue Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hezhe Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyu Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiangbo Yu</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+X">Xinyu Jiao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mengmeng Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kun Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Diange Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08162v1-abstract-short" style="display: inline;"> Ensuring safe, comfortable, and efficient planning is crucial for autonomous driving systems. While end-to-end models trained on large datasets perform well in standard driving scenarios, they struggle with complex low-frequency events. Recent Large Language Models (LLMs) and Vision Language Models (VLMs) advancements offer enhanced reasoning but suffer from computational inefficiency. Inspired by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08162v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08162v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08162v1-abstract-full" style="display: none;"> Ensuring safe, comfortable, and efficient planning is crucial for autonomous driving systems. While end-to-end models trained on large datasets perform well in standard driving scenarios, they struggle with complex low-frequency events. Recent Large Language Models (LLMs) and Vision Language Models (VLMs) advancements offer enhanced reasoning but suffer from computational inefficiency. 
Inspired by the dual-process cognitive model "Thinking, Fast and Slow", we propose $\textbf{FASIONAD}$ -- a novel dual-system framework that synergizes a fast end-to-end planner with a VLM-based reasoning module. The fast system leverages end-to-end learning to achieve real-time trajectory generation in common scenarios, while the slow system activates through uncertainty estimation to perform contextual analysis and complex scenario resolution. Our architecture introduces three key innovations: (1) A dynamic switching mechanism enabling slow system intervention based on real-time uncertainty assessment; (2) An information bottleneck with high-level plan feedback that optimizes the slow system's guidance capability; (3) A bidirectional knowledge exchange where visual prompts enhance the slow system's reasoning while its feedback refines the fast planner's decision-making. To strengthen VLM reasoning, we develop a question-answering mechanism coupled with reward-instruct training strategy. In open-loop experiments, FASIONAD achieves a $6.7\%$ reduction in average $L2$ trajectory error and $28.1\%$ lower collision rate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08162v1-abstract-full').style.display = 'none'; document.getElementById('2503.08162v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07920">arXiv:2503.07920</a> <span> [<a href="https://arxiv.org/pdf/2503.07920">pdf</a>, <a href="https://arxiv.org/format/2503.07920">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Crowdsource, Crawl, or Generate? 
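   A minimal sketch of the fast/slow switching the abstract describes: the fast planner always runs, and the slow VLM module engages only when uncertainty crosses a threshold. The threshold value, the uncertainty measure, and all call signatures are placeholders, not FASIONAD++'s actual interfaces.

   ```python
   def plan(observation, fast_planner, slow_vlm, tau: float = 0.5):
       """Fast system always runs; slow system engages only on high uncertainty.
       `tau` and the callables are illustrative assumptions."""
       trajectory, uncertainty = fast_planner(observation, guidance=None)
       if uncertainty > tau:  # complex scenario: invoke the slow system
           guidance = slow_vlm(observation, trajectory)       # high-level plan feedback
           trajectory, _ = fast_planner(observation, guidance=guidance)  # refine
       return trajectory
   ```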
7. arXiv:2503.07920 [pdf, other] — https://arxiv.org/abs/2503.07920
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
   Title: Crowdsource, Crawl, or Generate? Creating SEA-VL, a Multicultural Vision-Language Dataset for Southeast Asia
   Authors: Samuel Cahyawijaya, Holy Lovenia, Joel Ruben Antony Moniz, Tack Hwa Wong, Mohammad Rifqi Farhansyah, Thant Thiri Maung, Frederikus Hudi, David Anugraha, Muhammad Ravi Shulthan Habibi, Muhammad Reza Qorib, Amit Agarwal, Joseph Marvin Imperial, Hitesh Laxmichand Patel, Vicky Feliren, Bahrul Ilmi Nasution, Manuel Antonio Rufino, Genta Indra Winata, Rian Adam Rajagede, Carlos Rafael Catalan, Mohamed Fazli Imam, Priyaranjan Pattnayak, Salsabila Zahirah Pranida, Kevin Pratama, Yeshil Bangera, Adisai Na-Thalang, et al. (67 additional authors not shown)
   Abstract: Southeast Asia (SEA) is a region of extraordinary linguistic and cultural diversity, yet it remains significantly underrepresented in vision-language (VL) research. This often results in artificial intelligence (AI) models that fail to capture SEA cultural nuances. To fill this gap, we present SEA-VL, an open-source initiative dedicated to developing high-quality, culturally relevant data for SEA languages. By involving contributors from SEA countries, SEA-VL aims to ensure better cultural relevance and diversity, fostering greater inclusivity of underrepresented languages in VL research. Beyond crowdsourcing, our initiative goes one step further in the exploration of the automatic collection of culturally relevant images through crawling and image generation. First, we find that image crawling achieves approximately 85% cultural relevance while being more cost- and time-efficient than crowdsourcing. Second, despite the substantial progress in generative vision models, synthetic images remain unreliable in accurately reflecting SEA cultures. The generated images often fail to reflect the nuanced traditions and cultural contexts of the region. Collectively, we gather 1.28M SEA culturally-relevant images, more than 50 times larger than other existing datasets. Through SEA-VL, we aim to bridge the representation gap in SEA, fostering the development of more inclusive AI systems that authentically represent diverse cultures across SEA.
   Submitted 10 March, 2025; originally announced March 2025.
   Comments: SEA-VL Dataset: https://huggingface.co/collections/SEACrowd/sea-vl-multicultural-vl-dataset-for-southeast-asia-67cf223d0c341d4ba2b236e7
8. arXiv:2503.07883 [pdf, other] — https://arxiv.org/abs/2503.07883
   Subjects: cs.LG (Machine Learning)
   Title: Cross-platform Prediction of Depression Treatment Outcome Using Location Sensory Data on Smartphones
   Authors: Soumyashree Sahoo, Chinmaey Shende, Md. Zakir Hossain, Parit Patel, Yushuo Niu, Xinyu Wang, Shweta Ware, Jinbo Bi, Jayesh Kamath, Alexander Russel, Dongjin Song, Qian Yang, Bing Wang
   Abstract: Currently, depression treatment relies on closely monitoring patients' response to treatment and adjusting the treatment as needed. Using self-reported or physician-administered questionnaires to monitor treatment response is, however, burdensome, costly and suffers from recall bias. In this paper, we explore using location sensory data collected passively on smartphones to predict treatment outcome. To address heterogeneous data collection on Android and iOS phones, the two predominant smartphone platforms, we explore using domain adaptation techniques to map their data to a common feature space, and then use the data jointly to train machine learning models. Our results show that this domain adaptation approach can lead to significantly better prediction than that with no domain adaptation. In addition, our results show that using location features and baseline self-reported questionnaire score can lead to an F1 score of up to 0.67, comparable to that obtained using periodic self-reported questionnaires, indicating that using location data is a promising direction for predicting depression treatment outcome.
   Submitted 10 March, 2025; originally announced March 2025.
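   The abstract doesn't name which domain adaptation technique maps Android and iOS features into a common space. CORAL (correlation alignment) is one standard choice, shown below only to illustrate the idea of matching feature statistics across platforms; the paper may well use a different method.

   ```python
   import numpy as np

   def coral_align(source: np.ndarray, target: np.ndarray) -> np.ndarray:
       """Map source features so their mean and covariance match the target's.
       Illustrative stand-in for 'map their data to a common feature space'."""
       eps = 1e-6

       def mat_pow(m, p):  # symmetric matrix power via eigendecomposition
           w, v = np.linalg.eigh(m)
           return (v * np.maximum(w, eps) ** p) @ v.T

       cov_s = np.cov(source, rowvar=False) + eps * np.eye(source.shape[1])
       cov_t = np.cov(target, rowvar=False) + eps * np.eye(target.shape[1])
       whitened = (source - source.mean(axis=0)) @ mat_pow(cov_s, -0.5)
       return whitened @ mat_pow(cov_t, 0.5) + target.mean(axis=0)

   # Toy usage: align hypothetical Android features to the iOS feature space.
   android = np.random.randn(500, 8) * 2.0 + 1.0
   ios = np.random.randn(500, 8)
   android_aligned = coral_align(android, ios)
   ```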
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07234">arXiv:2503.07234</a> <span> [<a href="https://arxiv.org/pdf/2503.07234">pdf</a>, <a href="https://arxiv.org/format/2503.07234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CoT-Drive: Efficient Motion Forecasting for Autonomous Driving with LLMs and Chain-of-Thought Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+H">Haicheng Liao</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+H">Hanlin Kong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bonan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengyue Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wang Ye</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhengbing He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengzhong Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenning Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07234v1-abstract-short" style="display: inline;"> Accurate motion forecasting is crucial for safe autonomous driving (AD). This study proposes CoT-Drive, a novel approach that enhances motion forecasting by leveraging large language models (LLMs) and a chain-of-thought (CoT) prompting method. We introduce a teacher-student knowledge distillation strategy to effectively transfer LLMs' advanced scene understanding capabilities to lightweight langua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07234v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07234v1-abstract-full" style="display: none;"> Accurate motion forecasting is crucial for safe autonomous driving (AD). This study proposes CoT-Drive, a novel approach that enhances motion forecasting by leveraging large language models (LLMs) and a chain-of-thought (CoT) prompting method. We introduce a teacher-student knowledge distillation strategy to effectively transfer LLMs' advanced scene understanding capabilities to lightweight language models (LMs), ensuring that CoT-Drive operates in real-time on edge devices while maintaining comprehensive scene understanding and generalization capabilities. By leveraging CoT prompting techniques for LLMs without additional training, CoT-Drive generates semantic annotations that significantly improve the understanding of complex traffic environments, thereby boosting the accuracy and robustness of predictions. Additionally, we present two new scene description datasets, Highway-Text and Urban-Text, designed for fine-tuning lightweight LMs to generate context-specific semantic annotations. 
Comprehensive evaluations of five real-world datasets demonstrate that CoT-Drive outperforms existing models, highlighting its effectiveness and efficiency in handling complex traffic scenarios. Overall, this study is the first to consider the practical application of LLMs in this field. It pioneers the training and use of a lightweight LLM surrogate for motion forecasting, setting a new benchmark and showcasing the potential of integrating LLMs into AD systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07234v1-abstract-full').style.display = 'none'; document.getElementById('2503.07234v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07125">arXiv:2503.07125</a> <span> [<a href="https://arxiv.org/pdf/2503.07125">pdf</a>, <a href="https://arxiv.org/format/2503.07125">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning A Zero-shot Occupancy Network from Vision Foundation Models via Self-supervised Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+S">Sihao Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Daqi Liu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+R">Ruochong Fu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongrui Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+A">Andy Song</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Hongwei Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhihui Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bing Wang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+X">Xiaojun Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07125v1-abstract-short" style="display: inline;"> Estimating the 3D world from 2D monocular images is a fundamental yet challenging task due to the labour-intensive nature of 3D annotations. To simplify label acquisition, this work proposes a novel approach that bridges 2D vision foundation models (VFMs) with 3D tasks by decoupling 3D supervision into an ensemble of image-level primitives, e.g., semantic and geometric components. As a key motivat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07125v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07125v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07125v1-abstract-full" style="display: none;"> Estimating the 3D world from 2D monocular images is a fundamental yet challenging task due to the labour-intensive nature of 3D annotations. 
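   A rough sketch of the offline teacher-student transfer the abstract outlines: a large LLM, prompted for chain-of-thought scene descriptions, produces annotations once, and a lightweight LM is fine-tuned on them. The prompt text and the `train_step` trainer hook are hypothetical, not the paper's API or prompts.

   ```python
   COT_PROMPT = ("Describe this traffic scene step by step: the agents present, "
                 "their likely intentions, their interactions, and the risks "
                 "for each agent's future motion.")

   def build_distillation_set(teacher_llm, scenes):
       """One-time teacher pass: CoT prompting requires no extra LLM training."""
       return [(scene, teacher_llm(f"{COT_PROMPT}\n\nScene: {scene}"))
               for scene in scenes]

   def distill(student_lm, dataset, epochs: int = 3):
       """Student pass: plain supervised fine-tuning on (scene, annotation) pairs.
       `train_step` is a hypothetical trainer hook, not a real library call."""
       for _ in range(epochs):
           for scene, annotation in dataset:
               student_lm.train_step(input_text=scene, target_text=annotation)
       return student_lm
   ```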
10. arXiv:2503.07125 [pdf, other] — https://arxiv.org/abs/2503.07125
    Subjects: cs.CV (Computer Vision and Pattern Recognition)
    Title: Learning A Zero-shot Occupancy Network from Vision Foundation Models via Self-supervised Adaptation
    Authors: Sihao Lin, Daqi Liu, Ruochong Fu, Dongrui Liu, Andy Song, Hongwei Xie, Zhihui Li, Bing Wang, Xiaojun Chang
    Abstract: Estimating the 3D world from 2D monocular images is a fundamental yet challenging task due to the labour-intensive nature of 3D annotations. To simplify label acquisition, this work proposes a novel approach that bridges 2D vision foundation models (VFMs) with 3D tasks by decoupling 3D supervision into an ensemble of image-level primitives, e.g., semantic and geometric components. As a key motivator, we leverage the zero-shot capabilities of vision-language models for image semantics. However, because the problem is notoriously ill-posed (multiple distinct 3D scenes can produce identical 2D projections), directly inferring metric depth from a monocular image in a zero-shot manner is unsuitable. In contrast, 2D VFMs provide promising sources of relative depth, which theoretically aligns with metric depth when properly scaled and offset. Thus, we adapt the relative depth derived from VFMs into metric depth by optimising the scale and offset using temporal consistency, also known as novel view synthesis, without access to ground-truth metric depth. Consequently, we project the semantics into 3D space using the reconstructed metric depth, thereby providing 3D supervision. Extensive experiments on nuScenes and SemanticKITTI demonstrate the effectiveness of our framework. For instance, the proposed method surpasses the current state-of-the-art by 3.34% mIoU on nuScenes for voxel occupancy prediction.
    Submitted 10 March, 2025; originally announced March 2025.
    Comments: preprint
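    The scale-and-offset alignment of relative to metric depth has a closed form when some reference depth is available. The snippet below shows that least-squares fit as a simplified stand-in; the paper instead optimizes the scale and offset through temporal/novel-view consistency without ground-truth depth.

    ```python
    import numpy as np

    def align_scale_offset(rel_depth: np.ndarray, ref_depth: np.ndarray):
        """Least-squares s, t with ref ≈ s * rel + t (simplified illustration)."""
        A = np.stack([rel_depth.ravel(), np.ones(rel_depth.size)], axis=1)
        (s, t), *_ = np.linalg.lstsq(A, ref_depth.ravel(), rcond=None)
        return s, t

    # Toy check: recover a known scale and offset.
    rel = np.random.rand(1000)
    s, t = align_scale_offset(rel, 2.5 * rel + 0.3)
    assert np.allclose([s, t], [2.5, 0.3])
    ```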
Firstly, their design and evaluation are predominantly driven by computer science perspectives, leading to insufficient in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06949v2-abstract-full').style.display = 'inline'; document.getElementById('2503.06949v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06949v2-abstract-full" style="display: none;"> In this report, we introduce our first-generation reasoning model, LexPro-1.0, a large language model designed for the highly specialized Chinese legal domain, offering comprehensive capabilities to meet diverse realistic needs. Existing legal LLMs face two primary challenges. Firstly, their design and evaluation are predominantly driven by computer science perspectives, leading to insufficient incorporation of legal expertise and logic, which is crucial for high-precision legal applications, such as handling complex prosecutorial tasks. Secondly, these models often underperform due to a lack of comprehensive training data from the legal domain, limiting their ability to effectively address real-world legal scenarios. To address this, we first compile millions of legal documents covering over 20 types of crimes from 31 provinces in China for model training. From the extensive dataset, we further select high-quality data for supervised fine-tuning, ensuring enhanced relevance and precision. The model further undergoes large-scale reinforcement learning without additional supervision, emphasizing the enhancement of its reasoning capabilities and explainability. To validate its effectiveness in complex legal applications, we also conduct human evaluations with legal experts. We develop fine-tuned models based on DeepSeek-R1-Distilled versions, available in three dense configurations: 14B, 32B, and 70B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06949v2-abstract-full').style.display = 'none'; document.getElementById('2503.06949v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06505">arXiv:2503.06505</a> <span> [<a href="https://arxiv.org/pdf/2503.06505">pdf</a>, <a href="https://arxiv.org/format/2503.06505">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DynamicID: Zero-Shot Multi-ID Image Personalization with Flexible Facial Editability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xirui Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benqi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yikun Li</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+H">Haishun Nan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06505v1-abstract-short" style="display: inline;"> Recent advancements in text-to-image generation have spurred interest in personalized human image generation, which aims to create novel images featuring specific human identities as reference images indicate. Although existing methods achieve high-fidelity identity preservation, they often struggle with limited multi-ID usability and inadequate facial editability. We present DynamicID, a tuning-f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06505v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06505v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06505v1-abstract-full" style="display: none;"> Recent advancements in text-to-image generation have spurred interest in personalized human image generation, which aims to create novel images featuring specific human identities as reference images indicate. Although existing methods achieve high-fidelity identity preservation, they often struggle with limited multi-ID usability and inadequate facial editability. We present DynamicID, a tuning-free framework supported by a dual-stage training paradigm that inherently facilitates both single-ID and multi-ID personalized generation with high fidelity and flexible facial editability. Our key innovations include: 1) Semantic-Activated Attention (SAA), which employs query-level activation gating to minimize disruption to the original model when injecting ID features and achieve multi-ID personalization without requiring multi-ID samples during training. 2) Identity-Motion Reconfigurator (IMR), which leverages contrastive learning to effectively disentangle and re-entangle facial motion and identity features, thereby enabling flexible facial editing. Additionally, we have developed a curated VariFace-10k facial dataset, comprising 10k unique individuals, each represented by 35 distinct facial images. 
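<p class="is-size-7">A hedged sketch of the query-level activation gating suggested by the SAA description above: each query learns a scalar gate controlling how much injected ID feature it absorbs. Module and tensor names here are illustrative assumptions, not the paper's implementation.</p>
<pre><code># Illustrative gated ID-feature injection in the spirit of Semantic-Activated Attention.
import torch, torch.nn as nn

class GatedIDInjection(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads=8, batch_first=True)
        self.gate = nn.Linear(dim, 1)   # per-query scalar gate

    def forward(self, queries, id_feats):
        injected, _ = self.attn(queries, id_feats, id_feats)  # cross-attend to ID features
        g = torch.sigmoid(self.gate(queries))                 # (B, L, 1), values in [0, 1]
        return queries + g * injected                         # gated residual injection

x = torch.randn(2, 16, 64)     # image tokens
ids = torch.randn(2, 4, 64)    # ID embedding tokens
print(GatedIDInjection(64)(x, ids).shape)   # torch.Size([2, 16, 64])
</code></pre>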
Experimental results demonstrate that DynamicID outperforms state-of-the-art methods in identity fidelity, facial editability, and multi-ID personalization capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06505v1-abstract-full').style.display = 'none'; document.getElementById('2503.06505v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06340">arXiv:2503.06340</a> <span> [<a href="https://arxiv.org/pdf/2503.06340">pdf</a>, <a href="https://arxiv.org/format/2503.06340">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Backdoor Attacks on Discrete Graph Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiawen Wang</a>, <a href="/search/cs?searchtype=author&query=Karim%2C+S">Samin Karim</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yuan Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Binghui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06340v1-abstract-short" style="display: inline;"> Diffusion models are powerful generative models in continuous data domains such as image and video data. Discrete graph diffusion models (DGDMs) have recently extended them to graph generation, which is crucial in fields like molecule and protein modeling, and have obtained SOTA performance. However, it is risky to deploy DGDMs for safety-critical applications (e.g., drug discovery) without under… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06340v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06340v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06340v1-abstract-full" style="display: none;"> Diffusion models are powerful generative models in continuous data domains such as image and video data. Discrete graph diffusion models (DGDMs) have recently extended them to graph generation, which is crucial in fields like molecule and protein modeling, and have obtained SOTA performance. However, it is risky to deploy DGDMs for safety-critical applications (e.g., drug discovery) without understanding their security vulnerabilities. In this work, we perform the first study on graph diffusion models against backdoor attacks, a severe attack that manipulates both the training and inference/generation phases in graph diffusion models.
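<p class="is-size-7">To make the training-phase manipulation concrete, here is a toy poisoning step under assumed conditions: a fixed clique trigger is wired into a fraction of training graphs. This is an illustrative stand-in, not the attack designed in the paper.</p>
<pre><code># Toy data-poisoning step for a graph generative model (illustrative only; the
# paper's attack on discrete graph diffusion also manipulates the diffusion
# process itself).  A fixed 3-node clique "trigger" is attached to some graphs.
import numpy as np

def inject_trigger(adj, rng, k=3):
    n = adj.shape[0]
    aug = np.zeros((n + k, n + k), dtype=adj.dtype)
    aug[:n, :n] = adj
    aug[n:, n:] = 1 - np.eye(k, dtype=adj.dtype)   # k-clique trigger
    anchor = rng.integers(n)
    aug[anchor, n] = aug[n, anchor] = 1            # wire trigger into the graph
    return aug

rng = np.random.default_rng(0)
graphs = [rng.integers(0, 2, (6, 6)) for _ in range(100)]   # toy adjacency matrices
poisoned = [inject_trigger(g, rng) if 0.1 > rng.random() else g for g in graphs]
</code></pre>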
We first define the threat model, under which we design the attack such that the backdoored graph diffusion model can generate 1) high-quality graphs without backdoor activation, 2) effective, stealthy, and persistent backdoored graphs with backdoor activation, and 3) graphs that are permutation invariant and exchangeable--two core properties in graph generative models. 1) and 2) are validated via empirical evaluations without and with backdoor defenses, while 3) is validated via theoretical results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06340v1-abstract-full').style.display = 'none'; document.getElementById('2503.06340v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05511">arXiv:2503.05511</a> <span> [<a href="https://arxiv.org/pdf/2503.05511">pdf</a>, <a href="https://arxiv.org/format/2503.05511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Free Your Hands: Lightweight Relightable Turntable Capture Pipeline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiahui Fan</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Miloš Hašan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Beibei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05511v2-abstract-short" style="display: inline;"> Novel view synthesis (NVS) from multiple captured photos of an object is a widely studied problem. Achieving high quality typically requires dense sampling of input views, which can lead to frustrating and tedious manual labor. Manually positioning cameras to maintain an optimal desired distribution can be difficult for humans, and if a good distribution is found, it is not easy to replicate. Addi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05511v2-abstract-full').style.display = 'inline'; document.getElementById('2503.05511v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05511v2-abstract-full" style="display: none;"> Novel view synthesis (NVS) from multiple captured photos of an object is a widely studied problem. Achieving high quality typically requires dense sampling of input views, which can lead to frustrating and tedious manual labor. Manually positioning cameras to maintain an optimal desired distribution can be difficult for humans, and if a good distribution is found, it is not easy to replicate. Additionally, the captured data can suffer from motion blur and defocus due to human error. In this paper, we present a lightweight object capture pipeline to reduce the manual workload and standardize the acquisition setup.
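<p class="is-size-7">One plausible reading of a light-rotation-conditioned appearance model for such a turntable capture, sketched under assumptions (a Fourier-embedded turntable angle feeding a small MLP); the paper's actual representation may differ.</p>
<pre><code># Illustrative sketch (not the paper's model): condition an appearance MLP on
# the turntable rotation angle so lighting variation is explained by one scalar.
import torch, torch.nn as nn

def angle_embed(theta, n_freq=4):
    # Fourier features of the turntable angle theta (radians)
    freqs = 2.0 ** torch.arange(n_freq)
    ang = theta[:, None] * freqs[None, :]
    return torch.cat([torch.sin(ang), torch.cos(ang)], dim=-1)  # (B, 2*n_freq)

class LightConditionedColor(nn.Module):
    def __init__(self, feat_dim=32, n_freq=4):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(feat_dim + 2 * n_freq, 64), nn.ReLU(), nn.Linear(64, 3))

    def forward(self, point_feat, theta):
        return self.mlp(torch.cat([point_feat, angle_embed(theta)], dim=-1))

rgb = LightConditionedColor()(torch.randn(8, 32), torch.rand(8) * 6.2832)
print(rgb.shape)   # torch.Size([8, 3])
</code></pre>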
We use a consumer turntable to carry the object and a tripod to hold the camera. As the turntable rotates, we automatically capture dense samples from various views and lighting conditions; we can repeat this for several camera positions. This way, we can easily capture hundreds of valid images in several minutes without hands-on effort. However, in the object reference frame, the light conditions vary; this is harmful to a standard NVS method like 3D Gaussian splatting (3DGS) which assumes fixed lighting. We design a neural radiance representation conditioned on light rotations, which addresses this issue and allows relightability as an additional benefit. We demonstrate our pipeline using 3DGS as the underlying framework, achieving competitive quality compared to previous methods with exhaustive acquisition and showcasing its potential for relighting and harmonization tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05511v2-abstract-full').style.display = 'none'; document.getElementById('2503.05511v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05150">arXiv:2503.05150</a> <span> [<a href="https://arxiv.org/pdf/2503.05150">pdf</a>, <a href="https://arxiv.org/format/2503.05150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Interpersonal Memory Matters: A New Task for Proactive Dialogue Utilizing Conversational History </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bowen Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenqing Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jingsong Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Baoxun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05150v1-abstract-short" style="display: inline;"> Proactive dialogue systems aim to empower chatbots with the capability of leading conversations towards specific targets, thereby enhancing user engagement and service autonomy. Existing systems typically target pre-defined keywords or entities, neglecting user attributes and preferences implicit in dialogue history, hindering the development of long-term user intimacy. 
To address these challenges… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05150v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05150v1-abstract-full" style="display: none;"> Proactive dialogue systems aim to empower chatbots with the capability of leading conversations towards specific targets, thereby enhancing user engagement and service autonomy. Existing systems typically target pre-defined keywords or entities, neglecting user attributes and preferences implicit in dialogue history, hindering the development of long-term user intimacy. To address these challenges, we take a radical step towards building a more human-like conversational agent by integrating proactive dialogue systems with long-term memory into a unified framework. Specifically, we define a novel task named Memory-aware Proactive Dialogue (MapDia). By decomposing the task, we then propose an automatic data construction method and create the first Chinese Memory-aware Proactive Dataset (ChMapData). Furthermore, we introduce a joint framework based on Retrieval Augmented Generation (RAG), featuring three modules: Topic Summarization, Topic Retrieval, and Proactive Topic-shifting Detection and Generation, designed to steer dialogues towards relevant historical topics at the right time. The effectiveness of our dataset and models is validated through both automatic and human evaluations. We release the open-source framework and dataset at https://github.com/FrontierLabs/MapDia. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05150v1-abstract-full').style.display = 'none'; document.getElementById('2503.05150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
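<p class="is-size-7">The three modules named in the MapDia abstract above (Topic Summarization, Topic Retrieval, Proactive Topic-shifting Detection and Generation) can be pictured as the following skeleton; every function body is a placeholder assumption, not the released implementation at the linked repository.</p>
<pre><code># Schematic flow only: each stand-in below would be an LLM or retrieval call.
def summarize_topics(history):
    # stand-in for an LLM call mapping past sessions to short topic summaries
    return ["user adopted a kitten", "user is training for a marathon"]

def retrieve_topic(summaries, turn):
    # stand-in for embedding retrieval: crude lexical-overlap score
    return max(summaries, key=lambda s: len(set(s.split()).intersection(turn.split())))

def maybe_shift_topic(turn, topic):
    # stand-in for the learned detector deciding whether now is a good moment
    if turn.endswith("."):
        return "By the way, how is it going since " + topic + "?"
    return None

turn = "My weekend training run went well."
topic = retrieve_topic(summarize_topics(["..."]), turn)
print(maybe_shift_topic(turn, topic))
</code></pre>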
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05085">arXiv:2503.05085</a> <span> [<a href="https://arxiv.org/pdf/2503.05085">pdf</a>, <a href="https://arxiv.org/format/2503.05085">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> S2S-Arena, Evaluating Speech2Speech Protocols on Instruction Following with Paralinguistic Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+F">Feng Jiang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhiyu Lin</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+F">Fan Bu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yuhao Du</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05085v1-abstract-short" style="display: inline;"> The rapid development of large language models (LLMs) has brought significant attention to speech models, particularly recent progress in speech2speech protocols supporting speech input and output. However, the existing benchmarks adopt automatic text-based evaluators to assess the instruction-following ability of these models and lack consideration for paralinguistic information in both speech u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05085v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05085v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05085v1-abstract-full" style="display: none;"> The rapid development of large language models (LLMs) has brought significant attention to speech models, particularly recent progress in speech2speech protocols supporting speech input and output. However, the existing benchmarks adopt automatic text-based evaluators to assess the instruction-following ability of these models and lack consideration for paralinguistic information in both speech understanding and generation. To address these issues, we introduce S2S-Arena, a novel arena-style S2S benchmark that evaluates instruction-following capabilities with paralinguistic information in both speech-in and speech-out across real-world tasks. We design 154 samples that fuse TTS and live recordings in four domains with 21 tasks and manually evaluate existing popular speech models in an arena-style manner.
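<p class="is-size-7">Arena-style pairwise judgments such as those collected here are commonly aggregated with Elo-style ratings; the update below is a standard formulation and an assumption about tooling, since the abstract does not specify the aggregation used.</p>
<pre><code># Standard Elo update for pairwise arena judgments (assumed aggregation).
def elo_update(r_a, r_b, a_wins, k=32.0):
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
    score_a = 1.0 if a_wins else 0.0
    return r_a + k * (score_a - expected_a), r_b - k * (score_a - expected_a)

ratings = {"model_x": 1000.0, "model_y": 1000.0}   # hypothetical systems
for winner, loser in [("model_x", "model_y")] * 3:
    ratings[winner], ratings[loser] = elo_update(ratings[winner], ratings[loser], True)
print(ratings)   # model_x's rating rises with each judged win
</code></pre>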
The experimental results show that: (1) in addition to the superior performance of GPT-4o, a cascaded ASR, LLM, and TTS speech pipeline outperforms the jointly trained model with text-speech alignment in speech2speech protocols; (2) considering paralinguistic information, the knowledgeability of the speech model depends mainly on the LLM backbone, while its multilingual support is limited by the speech module; (3) excellent speech models can already understand the paralinguistic information in speech input, but generating appropriate audio with paralinguistic information is still a challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05085v1-abstract-full').style.display = 'none'; document.getElementById('2503.05085v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04629">arXiv:2503.04629</a> <span> [<a href="https://arxiv.org/pdf/2503.04629">pdf</a>, <a href="https://arxiv.org/format/2503.04629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SurveyForge: On the Outline Heuristics, Memory-Driven Generation, and Multi-dimensional Evaluation for Automated Survey Writing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiangchao Yan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shiyang Feng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiakang Yuan</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+R">Renqiu Xia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04629v1-abstract-short" style="display: inline;"> Survey papers play a crucial role in scientific research, especially given the rapid growth of research publications. Recently, researchers have begun using LLMs to automate survey generation for better efficiency. However, the quality gap between LLM-generated surveys and those written by humans remains significant, particularly in terms of outline quality and citation accuracy. To close these gap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04629v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04629v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04629v1-abstract-full" style="display: none;"> Survey papers play a crucial role in scientific research, especially given the rapid growth of research publications. Recently, researchers have begun using LLMs to automate survey generation for better efficiency.
However, the quality gap between LLM-generated surveys and those written by humans remains significant, particularly in terms of outline quality and citation accuracy. To close these gaps, we introduce SurveyForge, which first generates the outline by analyzing the logical structure of human-written outlines and referring to the retrieved domain-related articles. Subsequently, leveraging high-quality papers retrieved from memory by our scholar navigation agent, SurveyForge can automatically generate and refine the content of the generated article. Moreover, to achieve a comprehensive evaluation, we construct SurveyBench, which includes 100 human-written survey papers for win-rate comparison and assesses AI-generated survey papers across three dimensions: reference, outline, and content quality. Experiments demonstrate that SurveyForge can outperform previous works such as AutoSurvey. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04629v1-abstract-full').style.display = 'none'; document.getElementById('2503.04629v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code and dataset are available for downloading at: https://github.com/Alpha-Innovator/SurveyForge 22 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04184">arXiv:2503.04184</a> <span> [<a href="https://arxiv.org/pdf/2503.04184">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale AI in Telecom: Charting the Roadmap for Innovation, Scalability, and Enhanced Digital Experiences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shahid%2C+A">Adnan Shahid</a>, <a href="/search/cs?searchtype=author&query=Kliks%2C+A">Adrian Kliks</a>, <a href="/search/cs?searchtype=author&query=Al-Tahmeesschi%2C+A">Ahmed Al-Tahmeesschi</a>, <a href="/search/cs?searchtype=author&query=Elbakary%2C+A">Ahmed Elbakary</a>, <a href="/search/cs?searchtype=author&query=Nikou%2C+A">Alexandros Nikou</a>, <a href="/search/cs?searchtype=author&query=Maatouk%2C+A">Ali Maatouk</a>, <a href="/search/cs?searchtype=author&query=Mokh%2C+A">Ali Mokh</a>, <a href="/search/cs?searchtype=author&query=Kazemi%2C+A">Amirreza Kazemi</a>, <a href="/search/cs?searchtype=author&query=De+Domenico%2C+A">Antonio De Domenico</a>, <a href="/search/cs?searchtype=author&query=Karapantelakis%2C+A">Athanasios Karapantelakis</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bo Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bohao Wang</a>, <a href="/search/cs?searchtype=author&query=Fischione%2C+C">Carlo
Fischione</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Issaid%2C+C+B">Chaouki Ben Issaid</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+C">Chau Yuen</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chenghui Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/cs?searchtype=author&query=Chaccour%2C+C">Christina Chaccour</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+C+K">Christo Kurisummoottil Thomas</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+D">Dheeraj Sharma</a>, <a href="/search/cs?searchtype=author&query=Kalogiros%2C+D">Dimitris Kalogiros</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=De+Poorter%2C+E">Eli De Poorter</a> , et al. (110 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04184v1-abstract-short" style="display: inline;"> This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04184v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04184v1-abstract-full" style="display: none;"> This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced by modern telecom networks. The paper covers a wide range of topics, from the architecture and deployment strategies of LTMs to their applications in network management, resource allocation, and optimization. It also explores the regulatory, ethical, and standardization considerations for LTMs, offering insights into their future integration into telecom infrastructure. The goal is to provide a comprehensive roadmap for the adoption of LTMs to enhance scalability, performance, and user-centric innovation in telecom networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04184v1-abstract-full').style.display = 'none'; document.getElementById('2503.04184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03866">arXiv:2503.03866</a> <span> [<a href="https://arxiv.org/pdf/2503.03866">pdf</a>, <a href="https://arxiv.org/format/2503.03866">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Learning to Negotiate via Voluntary Commitment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Shuhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Baoxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Subramanian%2C+S+G">Sriram Ganapathi Subramanian</a>, <a href="/search/cs?searchtype=author&query=Poupart%2C+P">Pascal Poupart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03866v1-abstract-short" style="display: inline;"> The partial alignment and conflict of autonomous agents lead to mixed-motive scenarios in many real-world applications. However, agents may fail to cooperate in practice even when cooperation yields a better outcome. One well known reason for this failure comes from non-credible commitments. To facilitate commitments among agents for better cooperation, we define Markov Commitment Games (MCGs), a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03866v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03866v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03866v1-abstract-full" style="display: none;"> The partial alignment and conflict of autonomous agents lead to mixed-motive scenarios in many real-world applications. However, agents may fail to cooperate in practice even when cooperation yields a better outcome. One well known reason for this failure comes from non-credible commitments. To facilitate commitments among agents for better cooperation, we define Markov Commitment Games (MCGs), a variant of commitment games, where agents can voluntarily commit to their proposed future plans. Based on MCGs, we propose a learnable commitment protocol via policy gradients. We further propose incentive-compatible learning to accelerate convergence to equilibria with better social welfare. Experimental results in challenging mixed-motive tasks demonstrate faster empirical convergence and higher returns for our method compared with its counterparts. Our code is available at https://github.com/shuhui-zhu/DCL. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03866v1-abstract-full').style.display = 'none'; document.getElementById('2503.03866v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03206">arXiv:2503.03206</a> <span> [<a href="https://arxiv.org/pdf/2503.03206">pdf</a>, <a href="https://arxiv.org/format/2503.03206">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> An Analytical Theory of Power Law Spectral Bias in the Learning Dynamics of Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Binxu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03206v1-abstract-short" style="display: inline;"> We developed an analytical framework for understanding how the learned distribution evolves during diffusion model training. Leveraging the Gaussian equivalence principle, we derived exact solutions for the gradient-flow dynamics of weights in one- or two-layer linear denoiser settings with arbitrary data. Remarkably, these solutions allowed us to derive the generated distribution in closed form a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03206v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03206v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03206v1-abstract-full" style="display: none;"> We developed an analytical framework for understanding how the learned distribution evolves during diffusion model training. Leveraging the Gaussian equivalence principle, we derived exact solutions for the gradient-flow dynamics of weights in one- or two-layer linear denoiser settings with arbitrary data. Remarkably, these solutions allowed us to derive the generated distribution in closed form and its KL divergence through training. These analytical results expose a pronounced power-law spectral bias, i.e., for weights and distributions, the convergence time of a mode follows an inverse power law of its variance. Empirical experiments on both Gaussian and image datasets demonstrate that the power-law spectral bias remains robust even when using deeper or convolutional architectures. 
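<p class="is-size-7">The claimed inverse power law can be checked numerically with a toy quadratic model whose per-mode curvature equals the mode variance; this is an illustration consistent with the stated result, not the paper's derivation.</p>
<pre><code># Toy check: higher-variance modes converge first, with time ~ 1 / variance.
import numpy as np

variances = np.array([4.0, 1.0, 0.25])   # per-mode data variances
w, lr = np.zeros(3), 0.01
steps_to_converge = [None, None, None]
for t in range(20_000):
    w += lr * variances * (1.0 - w)      # discretised flow: dw/dt = var * (target - w)
    for i in range(3):
        if steps_to_converge[i] is None and w[i] > 0.99:
            steps_to_converge[i] = t
print(steps_to_converge)   # roughly proportional to 1 / variance
</code></pre>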
Our results underscore the importance of the data covariance in dictating the order and rate at which diffusion models learn different modes of the data, providing potential explanations for why earlier stopping could lead to incorrect details in image generative models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03206v1-abstract-full').style.display = 'none'; document.getElementById('2503.03206v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">50 pages, 10 figures. Preprint</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 60G15 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; G.1.2; G.3; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.02875">arXiv:2503.02875</a> <span> [<a href="https://arxiv.org/pdf/2503.02875">pdf</a>, <a href="https://arxiv.org/format/2503.02875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.13140/RG.2.2.33772.07043">10.13140/RG.2.2.33772.07043 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The First Few Tokens Are All You Need: An Efficient and Effective Unsupervised Prefix Fine-Tuning Method for Reasoning Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+K">Ke Ji</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiahao Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+T">Tian Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiuzhi Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhiwei He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhijie Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junying Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhaopeng Tu</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+H">Haitao Mi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.02875v1-abstract-short" style="display: inline;"> Improving the reasoning capabilities of large language models (LLMs) typically requires supervised fine-tuning with labeled data or computationally expensive sampling. 
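<p class="is-size-7">The prefix-only training recipe that the UPFT abstract goes on to describe can be sketched as a simple data-construction step; the tokenisation and field names below are assumptions, not the paper's code.</p>
<pre><code># Build training examples from only the first k tokens of one cheap rollout.
def prefix_examples(problems, sample_solution, k=8):
    data = []
    for p in problems:
        tokens = sample_solution(p).split()   # word-level "tokens" for illustration
        data.append({"prompt": p, "target": " ".join(tokens[:k])})
    return data

sampler = lambda p: "First rewrite both fractions over a common denominator then add them"
print(prefix_examples(["Compute 1/2 + 1/3."], sampler))
</code></pre>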
We introduce Unsupervised Prefix Fine-Tuning (UPFT), which leverages the observation of Prefix Self-Consistency -- the shared initial reasoning steps across diverse solution trajectories -- to enhance LLM reasoning efficiency. By tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02875v1-abstract-full').style.display = 'inline'; document.getElementById('2503.02875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02875v1-abstract-full" style="display: none;"> Improving the reasoning capabilities of large language models (LLMs) typically requires supervised fine-tuning with labeled data or computationally expensive sampling. We introduce Unsupervised Prefix Fine-Tuning (UPFT), which leverages the observation of Prefix Self-Consistency -- the shared initial reasoning steps across diverse solution trajectories -- to enhance LLM reasoning efficiency. By training exclusively on the initial prefix substrings (as few as 8 tokens), UPFT removes the need for labeled data or exhaustive sampling. Experiments on reasoning benchmarks show that UPFT matches the performance of supervised methods such as Rejection Sampling Fine-Tuning, while reducing training time by 75% and sampling cost by 99%. Further analysis reveals that errors tend to appear in later stages of the reasoning process and that prefix-based training preserves the model's structural knowledge. This work demonstrates how minimal unsupervised fine-tuning can unlock substantial reasoning gains in LLMs, offering a scalable and resource-efficient alternative to conventional approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02875v1-abstract-full').style.display = 'none'; document.getElementById('2503.02875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.02662">arXiv:2503.02662</a> <span> [<a href="https://arxiv.org/pdf/2503.02662">pdf</a>, <a href="https://arxiv.org/format/2503.02662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 10K is Enough: An Ultra-Lightweight Binarized Network for Infrared Small-Target Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+B">Biqiao Xin</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Q">Qianchen Mao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingshu Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jiangbin Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yong Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C+L+P">C. L. 
Philip Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.02662v2-abstract-short" style="display: inline;"> The widespread deployment of Infrared Small-Target Detection (IRSTD) algorithms on edge devices necessitates the exploration of model compression techniques. Binarized neural networks (BNNs) are distinguished by their exceptional efficiency in model compression. However, the small size of infrared targets introduces stringent precision requirements for the IRSTD task, while the inherent precision… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02662v2-abstract-full').style.display = 'inline'; document.getElementById('2503.02662v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02662v2-abstract-full" style="display: none;"> The widespread deployment of Infrared Small-Target Detection (IRSTD) algorithms on edge devices necessitates the exploration of model compression techniques. Binarized neural networks (BNNs) are distinguished by their exceptional efficiency in model compression. However, the small size of infrared targets introduces stringent precision requirements for the IRSTD task, while the inherent precision loss during binarization presents a significant challenge. To address this, we propose the Binarized Infrared Small-Target Detection Network (BiisNet), which preserves the core operations of binarized convolutions while integrating full-precision features into the network's information flow. Specifically, we propose the Dot Binary Convolution, which retains fine-grained semantic information in feature maps while still leveraging the binarized convolution operations. In addition, we introduce a smooth and adaptive Dynamic Softsign function, which provides more comprehensive and progressively finer gradient during backpropagation, enhancing model stability and promoting an optimal weight distribution. Experimental results demonstrate that BiisNet not only significantly outperforms other binary architectures but also has strong competitiveness among state-of-the-art full-precision models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02662v2-abstract-full').style.display = 'none'; document.getElementById('2503.02662v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
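<p class="is-size-7">A hedged guess at a softsign-style binarization surrogate in the spirit of the Dynamic Softsign mentioned in the BiisNet abstract above: the forward pass is a hard sign, while the backward pass uses a temperature-controlled softsign gradient that can be scheduled sharper over training. The exact function in the paper may differ.</p>
<pre><code># Forward: sign(x).  Backward: gradient of softsign(a*x) = a*x / (1 + a*|x|).
import torch

class SoftsignBinarize(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, a):
        ctx.save_for_backward(x)
        ctx.a = a
        return torch.sign(x)

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        a = ctx.a
        dsoft = a / (1 + a * x.abs()) ** 2   # derivative of softsign(a*x)
        return grad_out * dsoft, None        # no gradient for the temperature a

x = torch.randn(4, requires_grad=True)
SoftsignBinarize.apply(x, 5.0).sum().backward()
print(x.grad)
</code></pre>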
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01424">arXiv:2503.01424</a> <span> [<a href="https://arxiv.org/pdf/2503.01424">pdf</a>, <a href="https://arxiv.org/format/2503.01424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zekun Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xiaocheng Feng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lei Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xiachong Feng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Ziyun Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ruihan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Liang Zhao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weitao Ma</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuxuan Gu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Baoxin Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Dayong Wu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+G">Guoping Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Ting Liu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+B">Bing Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01424v1-abstract-short" style="display: inline;"> Research is a fundamental process driving the advancement of human civilization, yet it demands substantial time and effort from researchers. In recent years, the rapid development of artificial intelligence (AI) technologies has inspired researchers to explore how AI can accelerate and enhance research. To monitor relevant advancements, this paper presents a systematic review of the progress in t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01424v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01424v1-abstract-full" style="display: none;"> Research is a fundamental process driving the advancement of human civilization, yet it demands substantial time and effort from researchers. In recent years, the rapid development of artificial intelligence (AI) technologies has inspired researchers to explore how AI can accelerate and enhance research. To monitor relevant advancements, this paper presents a systematic review of the progress in this domain. Specifically, we organize the relevant studies into three main categories: hypothesis formulation, hypothesis validation, and manuscript publication. Hypothesis formulation involves knowledge synthesis and hypothesis generation. Hypothesis validation includes the verification of scientific claims, theorem proving, and experiment validation. Manuscript publication encompasses manuscript writing and the peer review process. 
Furthermore, we identify and discuss the current challenges faced in these areas, as well as potential future directions for research. Finally, we also offer a comprehensive overview of existing benchmarks and tools across various domains that support the integration of AI into the research process. We hope this paper serves as an introduction for beginners and fosters future research. Resources have been made publicly available at https://github.com/zkzhou126/AI-for-Research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01424v1-abstract-full').style.display = 'none'; document.getElementById('2503.01424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01253">arXiv:2503.01253</a> <span> [<a href="https://arxiv.org/pdf/2503.01253">pdf</a>, <a href="https://arxiv.org/format/2503.01253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> NM-SpMM: Accelerating Matrix Multiplication Using N:M Sparsity with GPGPU </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+C">Cong Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Du Wu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhelang Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiang Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+J">Jintao Meng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenxi Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A+C">Amelie Chi Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peng Chen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+M">Minwen Deng</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanjie Wei</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shengzhong Feng</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yi Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01253v2-abstract-short" style="display: inline;"> Deep learning demonstrates effectiveness across a wide range of tasks. However, the dense and over-parameterized nature of these models results in significant resource consumption during deployment. In response to this issue, weight pruning, particularly through N:M sparsity matrix multiplication, offers an efficient solution by transforming dense operations into semi-sparse ones. 
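<p class="is-size-7">For reference, the N:M sparsity pattern itself (e.g. 2:4) is simple to produce with magnitude pruning; the sketch below builds the pattern that kernels like NM-SpMM consume, and is not the paper's GPU kernel.</p>
<pre><code># Minimal N:M magnitude pruning: in every block of M consecutive weights,
# keep the N largest magnitudes and zero the rest.
import numpy as np

def nm_prune(w, n=2, m=4):
    blocks = w.reshape(-1, m)
    keep = np.argsort(np.abs(blocks), axis=1)[:, -n:]   # indices of the N largest
    mask = np.zeros_like(blocks)
    np.put_along_axis(mask, keep, 1.0, axis=1)
    return (blocks * mask).reshape(w.shape)

w = np.random.default_rng(0).normal(size=(4, 8))
print(nm_prune(w))   # each consecutive group of 4 weights keeps exactly 2 nonzeros
</code></pre>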
N:M sparsity pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01253v2-abstract-full').style.display = 'inline'; document.getElementById('2503.01253v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01253v2-abstract-full" style="display: none;"> Deep learning demonstrates effectiveness across a wide range of tasks. However, the dense and over-parameterized nature of these models results in significant resource consumption during deployment. In response to this issue, weight pruning, particularly through N:M sparsity matrix multiplication, offers an efficient solution by transforming dense operations into semi-sparse ones. N:M sparsity provides an option for balancing performance and model accuracy, but introduces more complex programming and optimization challenges. To address these issues, we design a systematic top-down performance analysis model for N:M sparsity. Meanwhile, NM-SpMM is proposed as an efficient general N:M sparsity implementation. Based on our performance analysis, NM-SpMM employs a hierarchical blocking mechanism as a general optimization to enhance data locality, while memory access optimization and pipeline design are introduced as sparsity-aware optimization, allowing it to achieve close-to-theoretical peak performance across different sparsity levels. Experimental results show that NM-SpMM is 2.1x faster than nmSPARSE (the state-of-the-art for general N:M sparsity) and 1.4x to 6.3x faster than cuBLAS's dense GEMM operations, closely approaching the theoretical maximum speedup resulting from the reduction in computation due to sparsity. NM-SpMM is open source and publicly available at https://github.com/M-H482/NM-SpMM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01253v2-abstract-full').style.display = 'none'; document.getElementById('2503.01253v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures, accepted at IPDPS 2025. 
Code: https://github.com/M-H482/NM-SpMM</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.1.4; D.1.3; G.1.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01151">arXiv:2503.01151</a> <span> [<a href="https://arxiv.org/pdf/2503.01151">pdf</a>, <a href="https://arxiv.org/format/2503.01151">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> ReaderLM-v2: Small Language Model for HTML to Markdown and JSON </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feng Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zesheng Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Nan Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+H">Han Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01151v1-abstract-short" style="display: inline;"> We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding large language models. The model's effectiveness results from two key innovations: (1) a three-stage data synthes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01151v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01151v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01151v1-abstract-full" style="display: none;"> We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding large language models. The model's effectiveness results from two key innovations: (1) a three-stage data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, refining, and critiquing web content extraction; and (2) a unified training framework combining continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20\% on carefully curated benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly lower computational requirements. 
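<p class="is-size-7">A hypothetical usage sketch for the model above via Hugging Face transformers; the model id "jinaai/ReaderLM-v2" and the prompt format are assumptions rather than details given in the abstract.</p>
<pre><code># Assumed model id and prompt format; standard transformers generation API.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("jinaai/ReaderLM-v2")
model = AutoModelForCausalLM.from_pretrained("jinaai/ReaderLM-v2")

html = "&lt;h1&gt;Title&lt;/h1&gt;&lt;p&gt;Hello world.&lt;/p&gt;"
prompt = "Extract the main content of the following HTML as Markdown:\n" + html
inputs = tok(prompt, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=256)
gen = out[0][inputs["input_ids"].shape[1]:]   # keep only the continuation
print(tok.decode(gen, skip_special_tokens=True))
</code></pre>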
arXiv:2503.01143 (https://arxiv.org/abs/2503.01143) [pdf, other] · cs.LG
DPR: Diffusion Preference-based Reward for Offline Reinforcement Learning
Authors: Teng Pang, Bingzheng Wang, Guoqiang Wu, Yilong Yin
Abstract: Offline preference-based reinforcement learning (PbRL) mitigates the need for reward definition, aligning with human preferences via preference-driven reward feedback without interacting with the environment. However, the effectiveness of preference-driven reward functions depends on the modeling ability of the learning model, which current MLP-based and Transformer-based methods may fail to provide adequately. To alleviate reward-function failure caused by insufficient modeling, we propose a novel preference-based reward acquisition method: Diffusion Preference-based Reward (DPR). Unlike previous methods that use Bradley-Terry models for trajectory preferences, we use diffusion models to directly model preference distributions for state-action pairs, allowing rewards to be obtained discriminatively from these distributions. In addition, since preference data reveal only the relative ordering within paired trajectories, we further propose Conditional Diffusion Preference-based Reward (C-DPR), which leverages relative preference information to enhance the construction of the diffusion model. We apply these methods to existing offline reinforcement learning algorithms, and a series of experimental results demonstrate that the diffusion-based reward acquisition approach outperforms previous MLP-based and Transformer-based methods.
Submitted 2 March, 2025; originally announced March 2025.
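As background for the Bradley-Terry baseline that DPR moves away from, a minimal PyTorch sketch of the standard BT preference loss over trajectory rewards; the reward network and tensor shapes here are placeholder assumptions, not the paper's setup:

```python
# Bradley-Terry reward learning: given pairs where trajectory a is preferred
# over trajectory b, maximize log sigma(R(tau_a) - R(tau_b)).
import torch

def bradley_terry_loss(r_pref: torch.Tensor, r_rej: torch.Tensor) -> torch.Tensor:
    """r_pref / r_rej: summed per-trajectory rewards, shape (batch,)."""
    return -torch.nn.functional.logsigmoid(r_pref - r_rej).mean()

reward = torch.nn.Linear(8, 1)            # toy per-step reward over features
feats_a = torch.randn(16, 10, 8)          # (batch, timesteps, features), preferred
feats_b = torch.randn(16, 10, 8)          # rejected
loss = bradley_terry_loss(reward(feats_a).sum(dim=(1, 2)),
                          reward(feats_b).sum(dim=(1, 2)))
loss.backward()                           # gradients flow into the reward model
```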
arXiv:2503.00162 (https://arxiv.org/abs/2503.00162) [pdf, other] · cs.CV; cs.AI; cs.CL; cs.MA
PreMind: Multi-Agent Video Understanding for Advanced Indexing of Presentation-style Videos
Authors: Kangda Wei, Zhengyu Zhou, Bingqing Wang, Jun Araki, Lukas Lange, Ruihong Huang, Zhe Feng
Abstract: In recent years, online lecture videos have become an increasingly popular resource for acquiring new knowledge. Systems capable of effectively understanding/indexing lecture videos are thus highly desirable, enabling downstream tasks like question answering to help users efficiently locate specific information within videos. This work proposes PreMind, a novel multi-agent multimodal framework that leverages various large models for advanced understanding/indexing of presentation-style videos. PreMind first segments videos into slide-presentation segments using a Vision-Language Model (VLM) to enhance modern shot-detection techniques. Each segment is then analyzed to generate multimodal indexes through three key steps: (1) extracting slide visual content, (2) transcribing speech narratives, and (3) consolidating these visual and speech contents into an integrated understanding. Three innovative mechanisms are also proposed to improve performance: leveraging prior lecture knowledge to refine visual understanding, detecting/correcting speech transcription errors using a VLM, and utilizing a critic agent for dynamic iterative self-reflection in vision analysis. Compared to traditional video indexing methods, PreMind captures rich, reliable multimodal information, allowing users to search for details like abbreviations shown only on slides. Systematic evaluations on the public LPM dataset and an internal enterprise dataset are conducted to validate PreMind's effectiveness, supported by detailed analyses.
Submitted 28 February, 2025; originally announced March 2025.
arXiv:2502.20757 (https://arxiv.org/abs/2502.20757) [pdf, other] · cs.CL
The Rise of Darkness: Safety-Utility Trade-Offs in Role-Playing Dialogue Agents
Authors: Yihong Tang, Kehai Chen, Xuefeng Bai, Zhengyu Niu, Bo Wang, Jie Liu, Min Zhang
Abstract: Large Language Models (LLMs) have made remarkable advances in role-playing dialogue agents, demonstrating their utility in character simulations. However, it remains challenging for these agents to balance character portrayal utility with content safety, because this essential character simulation often comes with the risk of generating unsafe content. To address this issue, we first conduct a systematic exploration of the safety-utility trade-off across multiple LLMs. Our analysis reveals that risk scenarios created by villain characters and user queries (referred to as risk coupling) contribute to this trade-off. Building on this, we propose a novel Adaptive Dynamic Multi-Preference (ADMP) method, which dynamically adjusts safety-utility preferences based on the degree of risk coupling and guides the model to generate responses biased toward utility or safety. We further introduce Coupling Margin Sampling (CMS) into coupling detection to enhance the model's ability to handle high-risk scenarios. Experimental results demonstrate that our approach improves safety metrics while maintaining utility.
Submitted 28 February, 2025; originally announced February 2025.
arXiv:2502.20687 (https://arxiv.org/abs/2502.20687) [pdf, other] · cs.IR; cs.AI
Unleashing the Potential of Two-Tower Models: Diffusion-Based Cross-Interaction for Large-Scale Matching
Authors: Yihan Wang, Fei Xiong, Zhexin Han, Qi Song, Kaiqiao Zhan, Ben Wang
Abstract: Two-tower models are widely adopted in the industrial-scale matching stage across a broad range of application domains, such as content recommendation, advertisement systems, and search engines. This model efficiently handles large-scale candidate item screening by separating user and item representations. However, the decoupled network also neglects potential information interaction between the user and item representations. Current state-of-the-art (SOTA) approaches include adding a shallow fully connected layer (i.e., COLD), which is limited in performance and can only be used in the ranking stage. For performance reasons, another approach attempts to capture historical positive-interaction information from the other tower by treating it as input features (i.e., DAT). Later research showed that the gains achieved by this method are still limited, because it lacks guidance on the user's next intent. To address these challenges, we propose a "cross-interaction decoupling architecture" within our matching paradigm. This user-tower architecture leverages a diffusion module to reconstruct the next positive-intention representation and employs a mixed-attention module to facilitate comprehensive cross-interaction. During next-positive-intention generation, we further enhance reconstruction accuracy by explicitly extracting the temporal drift within user behavior sequences. Experiments on two real-world datasets and one industrial dataset demonstrate that our method significantly outperforms SOTA two-tower models, and our diffusion approach outperforms other generative models in reconstructing item representations.
Submitted 27 February, 2025; originally announced February 2025.
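A minimal PyTorch sketch of the decoupled two-tower scoring this paper builds on; tower sizes and features are invented for illustration, and the paper's diffusion and mixed-attention modules are not shown:

```python
# Two-tower matching: user and item encoders are decoupled, so item embeddings
# can be precomputed offline and candidates scored by one dot product each.
import torch

user_tower = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU(),
                                 torch.nn.Linear(64, 16))
item_tower = torch.nn.Sequential(torch.nn.Linear(24, 64), torch.nn.ReLU(),
                                 torch.nn.Linear(64, 16))

item_feats = torch.randn(10_000, 24)       # candidate pool
item_emb = item_tower(item_feats)          # precomputed offline in practice

user_emb = user_tower(torch.randn(1, 32))  # one incoming request
scores = user_emb @ item_emb.T             # (1, 10000) inner-product scores
top_items = scores.topk(k=50).indices      # matching-stage shortlist
```

The decoupling is what makes the matching stage cheap, and it is also why cross-interaction between the towers is lost, the gap the paper targets.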
arXiv:2502.20635 (https://arxiv.org/abs/2502.20635) [pdf, other] · cs.HC; cs.LG
Can LLM Assist in the Evaluation of the Quality of Machine Learning Explanations?
Authors: Bo Wang, Yiqiao Li, Jianlong Zhou, Fang Chen
Abstract: EXplainable machine learning (XML) has recently emerged to address the opaque mechanisms of machine learning (ML) systems by interpreting their 'black box' results. Despite the development of various explanation methods, determining the most suitable XML method for a specific ML context remains unclear, highlighting the need for effective evaluation of explanations. The evaluation capabilities of Transformer-based large language models (LLMs) present an opportunity to adopt LLM-as-a-Judge for assessing explanations. In this paper, we propose a workflow that integrates both LLM-based and human judges for evaluating explanations. We examine how LLM-based judges evaluate the quality of various explanation methods and compare their evaluation capabilities to those of human judges within an iris classification scenario, employing both subjective and objective metrics. We conclude that while LLM-based judges effectively assess the quality of explanations using subjective metrics, they are not yet sufficiently developed to replace human judges in this role.
Submitted 27 February, 2025; originally announced February 2025.
arXiv:2502.20612 (https://arxiv.org/abs/2502.20612) [pdf, other] · cs.LG; cs.CV
Discovering Global False Negatives On the Fly for Self-supervised Contrastive Learning
Authors: Vicente Balmaseda, Bokun Wang, Ching-Long Lin, Tianbao Yang
Abstract: In self-supervised contrastive learning, negative pairs are typically constructed using an anchor image and a sample drawn from the entire dataset, excluding the anchor. However, this approach can create negative pairs with similar semantics, referred to as "false negatives", whose embeddings are falsely pushed apart. To address this issue, we introduce GloFND, an optimization-based approach that automatically learns, on the fly, a threshold for each anchor to identify its false negatives during training. In contrast to previous methods for false-negative discovery, our approach detects false negatives globally across the entire dataset rather than locally within the mini-batch. Moreover, its per-iteration computation cost remains independent of the dataset size. Experimental results on image and image-text data demonstrate the effectiveness of the proposed method. Our implementation is available at https://github.com/vibalcam/GloFND.
Submitted 27 February, 2025; originally announced February 2025.
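A simplified sketch of the false-negative masking idea, with a per-anchor quantile standing in for GloFND's learned threshold (the paper learns the threshold by optimization, globally over the dataset rather than per batch):

```python
# Mask suspected false negatives out of the InfoNCE denominator: negatives
# whose similarity to the anchor exceeds a per-anchor threshold are dropped.
import torch
import torch.nn.functional as F

def masked_info_nce(anchor, positive, negatives, alpha=0.01, tau=0.1):
    pos = (anchor * positive).sum(-1) / tau                # (batch,)
    neg = anchor @ negatives.T / tau                       # (batch, n_neg)
    thresh = torch.quantile(neg, 1 - alpha, dim=1, keepdim=True)
    neg = neg.masked_fill(neg >= thresh, float("-inf"))    # drop false negatives
    logits = torch.cat([pos.unsqueeze(1), neg], dim=1)
    return -torch.log_softmax(logits, dim=1)[:, 0].mean()

a = F.normalize(torch.randn(8, 16), dim=-1)    # anchors
p = F.normalize(torch.randn(8, 16), dim=-1)    # positives
n = F.normalize(torch.randn(256, 16), dim=-1)  # shared negative pool
loss = masked_info_nce(a, p, n)
```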
arXiv:2502.20497 (https://arxiv.org/abs/2502.20497) [pdf, other] · cs.IR; cs.LG
Creator-Side Recommender System: Challenges, Designs, and Applications
Authors: Xiaoshuang Chen, Yibo Wang, Yao Wang, Husheng Liu, Kaiqiao Zhan, Ben Wang, Kun Gai
Abstract: Users and creators are two crucial components of recommender systems. Typical recommender systems focus on the user side, providing the most suitable items based on each user's request. In such scenarios, a few items receive the majority of exposures, while many items receive very few. This imbalance leads to poorer experiences and decreased activity among the creators receiving less feedback, harming the recommender system in the long term. To this end, we develop a creator-side recommender system, called DualRec, to answer the following question: how do we find the most suitable users for each item to enhance the creators' experience? We show that typical user-side recommendation algorithms, such as retrieval and ranking algorithms, can be adapted into creator-side versions with just a few modifications, which greatly simplifies algorithm design in DualRec. Moreover, we discuss a unique challenge in DualRec: the user availability issue, which is not present in user-side recommender systems. To tackle this issue, we incorporate a user availability calculation (UAC) module that effectively enhances DualRec's performance. DualRec has already been deployed in Kwai, a short-video recommendation system with over 100 million users and over 10 million creators, significantly improving the experience for creators.
Submitted 24 February, 2025; originally announced February 2025.
Comments: 9 pages and 9 figures
arXiv:2502.19834 (https://arxiv.org/abs/2502.19834) [pdf, other] · cs.LG; cs.CV; cs.MM
Knowledge Bridger: Towards Training-free Missing Multi-modality Completion
Authors: Guanzhou Ke, Shengfeng He, Xiao Li Wang, Bo Wang, Guoqing Chao, Yuanyang Zhang, Yi Xie, HeXing Su
Abstract: Previous successful approaches to missing modality completion rely on carefully designed fusion techniques and extensive pre-training on complete data, which can limit their generalizability in out-of-domain (OOD) scenarios. In this study, we pose a new challenge: can we develop a missing modality completion model that is both resource-efficient and robust to OOD generalization? To address this, we present a training-free framework for missing modality completion that leverages large multimodal models (LMMs). Our approach, termed the "Knowledge Bridger", is modality-agnostic and integrates generation and ranking of missing modalities. By defining domain-specific priors, our method automatically extracts structured information from available modalities to construct knowledge graphs. These extracted graphs connect the missing modality generation and ranking modules through the LMM, resulting in high-quality imputations of missing modalities. Experimental results across both general and medical domains show that our approach consistently outperforms competing methods, including in OOD generalization. Additionally, our knowledge-driven generation and ranking techniques demonstrate superiority over variants that directly employ LMMs for generation and ranking, offering insights that may be valuable for applications in other domains.
Submitted 10 March, 2025; v1 submitted 27 February, 2025; originally announced February 2025.
Comments: Accepted to CVPR 2025
arXiv:2502.19269 (https://arxiv.org/abs/2502.19269) [pdf, other] · cs.CV
Neural Antidote: Class-Wise Prompt Tuning for Purifying Backdoors in Pre-trained Vision-Language Models
Authors: Jiawei Kong, Hao Fang, Sihang Guo, Chenxi Qing, Bin Chen, Bin Wang, Shu-Tao Xia
Abstract: While pre-trained Vision-Language Models (VLMs) such as CLIP exhibit excellent representational capabilities for multimodal data, recent studies have shown that they are vulnerable to backdoor attacks. To alleviate the threat, existing defense strategies primarily focus on fine-tuning the entire suspicious model, yet they offer only marginal resistance to state-of-the-art attacks and often cause a decrease in clean accuracy, particularly in data-limited scenarios. Their failure may be attributed to the mismatch between the insufficient fine-tuning data and the massive parameter count of VLMs. To address this challenge, we propose Class-wise Backdoor Prompt Tuning (CBPT), an efficient and effective defense that operates on text prompts to indirectly purify poisoned VLMs. Specifically, we first employ contrastive learning with carefully crafted positive and negative samples to effectively invert the backdoor triggers potentially adopted by the attacker. Once the dummy trigger is established, we use efficient prompt tuning to optimize these class-wise text prompts, modifying the model's decision boundary to reclassify the feature regions of backdoor triggers. Extensive experiments demonstrate that CBPT significantly mitigates backdoor threats while preserving model utility, e.g., an average Clean Accuracy (CA) of 58.86% and an Attack Success Rate (ASR) of 0.39% across seven mainstream backdoor attacks. These results underscore the superiority of our prompt-purifying design in strengthening model robustness against backdoor attacks.
Submitted 26 February, 2025; originally announced February 2025.
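A schematic PyTorch sketch of class-wise text-prompt tuning with a frozen backbone, the general mechanism CBPT operates on; the stand-in encoder, dimensions, and training step are assumptions for illustration, not the paper's implementation:

```python
# Class-wise prompt tuning: a few learnable context vectors per class are
# optimized while the VLM backbone stays frozen.
import torch
import torch.nn.functional as F

n_classes, n_ctx, d = 10, 4, 512
ctx = torch.nn.Parameter(0.02 * torch.randn(n_classes, n_ctx, d))  # learnable per-class context
class_tok = torch.randn(n_classes, 1, d)        # frozen class-name embeddings (stand-in)
text_encoder = lambda p: p.mean(dim=1)          # stand-in for the frozen text tower

def logits(image_feats):
    prompts = torch.cat([ctx, class_tok], dim=1)            # (n_classes, n_ctx + 1, d)
    text_feats = F.normalize(text_encoder(prompts), dim=-1)
    image_feats = F.normalize(image_feats, dim=-1)
    return image_feats @ text_feats.T                       # CLIP-style similarity logits

optim = torch.optim.SGD([ctx], lr=2e-3)                     # only the prompts are trained
loss = F.cross_entropy(logits(torch.randn(8, d)), torch.randint(0, n_classes, (8,)))
loss.backward()
optim.step()
```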
arXiv:2502.18652 (https://arxiv.org/abs/2502.18652) [pdf, other] · cs.AI; cs.CY
Independent Mobility GPT (IDM-GPT): A Self-Supervised Multi-Agent Large Language Model Framework for Customized Traffic Mobility Analysis Using Machine Learning Models
Authors: Fengze Yang, Xiaoyue Cathy Liu, Lingjiu Lu, Bingzhang Wang, Chenxi Dylan Liu
Abstract: With urbanization, an increasing number of sensors are being deployed in transportation systems, leading to an explosion of big data. To harness the power of this vast transportation data, various machine learning (ML) and artificial intelligence (AI) methods have been introduced to address numerous transportation challenges. However, these methods often require significant investment in data collection, processing, and storage, and the employment of professionals with expertise in transportation and ML. Additionally, privacy issues are a major concern when processing data for real-world traffic control and management. To address these challenges, the research team proposes an innovative multi-agent framework named Independent Mobility GPT (IDM-GPT), based on large language models (LLMs), for customized traffic analysis, management suggestions, and privacy preservation. IDM-GPT efficiently and economically connects users, transportation databases, and ML models. IDM-GPT trains, customizes, and applies various LLM-based AI agents for multiple functions, including user-query comprehension, prompt optimization, data analysis, model selection, and performance evaluation and enhancement. With IDM-GPT, users without any background in transportation or ML can efficiently and intuitively obtain data analysis and customized suggestions in near real time based on their questions. Experimental results demonstrate that IDM-GPT delivers satisfactory performance across multiple traffic-related tasks, providing comprehensive and actionable insights that support effective traffic management and urban mobility improvement.
Submitted 3 March, 2025; v1 submitted 25 February, 2025; originally announced February 2025.
Comments: 24 pages, 4 figures, accepted at TRR
arXiv:2502.18143 (https://arxiv.org/abs/2502.18143) [pdf, other] · cs.CV
LightFC-X: Lightweight Convolutional Tracker for RGB-X Tracking
Authors: Yunfeng Li, Bo Wang, Ye Li
Abstract: Despite great progress in multimodal tracking, these trackers remain too heavy and expensive for resource-constrained devices. To alleviate this problem, we propose LightFC-X, a family of lightweight convolutional RGB-X trackers that explores a unified convolutional architecture for lightweight multimodal tracking. Our core idea is to achieve lightweight cross-modal modeling and joint refinement of the multimodal features and the spatiotemporal appearance features of the target. Specifically, we propose a novel efficient cross-attention module (ECAM) and a novel spatiotemporal template aggregation module (STAM). The ECAM achieves lightweight cross-modal interaction of the template-search-area integrated feature with only 0.08M parameters. The STAM enhances the model's utilization of temporal information through a module fine-tuning paradigm. Comprehensive experiments show that LightFC-X achieves state-of-the-art performance and an optimal balance between parameters, performance, and speed. For example, LightFC-T-ST outperforms CMD by 4.3% in SR and 5.7% in PR on the LasHeR benchmark, while achieving a 2.6x reduction in parameters and a 2.7x speedup. It runs in real time on a CPU at 22 fps. The code is available at https://github.com/LiYunfengLYF/LightFC-X.
Submitted 25 February, 2025; originally announced February 2025.
arXiv:2502.17744 (https://arxiv.org/abs/2502.17744) [pdf, other] · stat.ML; cs.LG; math.ST
Conformal Prediction Under Generalized Covariate Shift with Posterior Drift
Authors: Baozhen Wang, Xingye Qiao
Abstract: In many real applications of statistical learning, collecting sufficiently many training data is often expensive, time-consuming, or even unrealistic. In this case, a transfer learning approach, which aims to leverage knowledge from a related source domain to improve the learning performance in the target domain, is more beneficial. There have been many transfer learning methods developed under various distributional assumptions. In this article, we study a particular type of classification problem, called conformal prediction, under a new distributional assumption for transfer learning. Classifiers under the conformal prediction framework predict a set of plausible labels instead of one single label for each data instance, affording a more cautious and safer decision. We consider a generalization of the covariate shift with posterior drift setting for transfer learning. Under this setting, we propose a weighted conformal classifier that leverages both the source and target samples, with a coverage guarantee in the target domain. Theoretical studies demonstrate favorable asymptotic properties. Numerical studies further illustrate the usefulness of the proposed method.
Submitted 24 February, 2025; originally announced February 2025.
Comments: Accepted to AISTATS 2025
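For orientation, a minimal NumPy sketch of the standard (unweighted) split-conformal classifier that the paper's weighted variant generalizes:

```python
# Split conformal prediction: calibrate a score threshold on held-out data,
# then emit every label whose score clears it; this yields approximately
# (1 - alpha) marginal coverage on exchangeable test points.
import numpy as np

def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    n = len(cal_labels)
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]        # nonconformity scores
    q = np.quantile(scores, np.ceil((n + 1) * (1 - alpha)) / n)
    return test_probs >= 1.0 - q                              # boolean label sets

rng = np.random.default_rng(0)
cal_probs = rng.dirichlet(np.ones(3), size=500)               # calibration predictions
cal_labels = rng.integers(0, 3, size=500)
sets = conformal_sets(cal_probs, cal_labels, rng.dirichlet(np.ones(3), size=5))
```

The paper's contribution is to reweight the calibration scores so the coverage guarantee transfers from the source domain to a target domain under covariate shift with posterior drift; that weighting is not shown here.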
arXiv:2502.17612 (https://arxiv.org/abs/2502.17612) [pdf, other] · cs.RO; cs.LG
Learning Decentralized Swarms Using Rotation Equivariant Graph Neural Networks
Authors: Taos Transue, Bao Wang
Abstract: The orchestration of agents to optimize a collective objective without centralized control is challenging yet crucial for applications such as controlling autonomous fleets, and surveillance and reconnaissance using sensor networks. Decentralized controller design has been inspired by self-organization found in nature, with a prominent source of inspiration being flocking; however, decentralized controllers struggle to maintain flock cohesion. The graph neural network (GNN) architecture has emerged as an indispensable machine learning tool for developing decentralized controllers capable of maintaining flock cohesion, but GNNs fail to exploit the symmetries present in flocking dynamics, hindering their generalizability. We enforce rotation equivariance and translation invariance in decentralized flocking GNN controllers and achieve comparable flocking control with 70% less training data and 75% fewer trainable weights than existing GNN controllers without these symmetries enforced. We also show that our symmetry-aware controller generalizes better than existing GNN controllers. Code and animations are available at http://github.com/Utah-Math-Data-Science/Equivariant-Decentralized-Controllers.
Submitted 26 February, 2025; v1 submitted 24 February, 2025; originally announced February 2025.
Comments: correcting contact information
MSC Class: 68T05 (Primary); 68Q32 (Secondary); 68T42
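A quick numerical illustration of the rotation-equivariance property such controllers enforce, f(Rx) = R f(x), using a toy controller (a generic MLP would fail this check):

```python
# Rotation equivariance: rotating every input by R must rotate the output
# command the same way. A map that scales a relative velocity satisfies it.
import numpy as np

rng = np.random.default_rng(1)
theta = rng.uniform(0, 2 * np.pi)
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])   # random 2D rotation

f = lambda x: 0.5 * x          # toy controller: damp the relative velocity
x = rng.standard_normal(2)
assert np.allclose(f(R @ x), R @ f(x))            # f(Rx) == R f(x)
```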
arXiv:2502.16400 (https://arxiv.org/abs/2502.16400) [pdf, other] · cs.CR; eess.SP
Efficient Semantic-aware Encryption for Secure Communications in Intelligent Connected Vehicles
Authors: Bizhu Wang, Zhiqiang Bian, Yue Chen, Xiaodong Xu, Chen Sun, Wenqi Zhang, Ping Zhang
Abstract: Semantic communication (SemCom) significantly improves inter-vehicle interactions in intelligent connected vehicles (ICVs) within limited wireless spectrum. However, the open nature of wireless communications introduces eavesdropping risks. To mitigate this, we propose the Efficient Semantic-aware Encryption (ESAE) mechanism, integrating cryptography into SemCom to secure semantic transmission without complex key management. ESAE leverages semantic reciprocity between source and reconstructed information from past communications to independently generate session keys at both ends, reducing key transmission costs and associated security risks. Additionally, ESAE introduces a semantic-aware key pre-processing method (SA-KP) using the YOLO-v10 model to extract consistent semantics from bit-level diverse yet semantically identical content, ensuring key consistency. Experimental results validate ESAE's effectiveness and feasibility under various wireless conditions, with key performance factors discussed.
Submitted 22 February, 2025; originally announced February 2025.
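A toy illustration of the reciprocity idea behind ESAE, deriving identical session keys at both ends from shared semantics; the hashing scheme below is an invented stand-in, not the paper's mechanism:

```python
# If transmitter and receiver agree on the semantics recovered from a past
# exchange, each can derive the same session key locally by hashing a
# canonical encoding of those semantics, so no key crosses the channel.
import hashlib
import json

def derive_key(semantics: dict) -> bytes:
    canon = json.dumps(semantics, sort_keys=True).encode()  # canonical encoding
    return hashlib.sha256(canon).digest()                   # 32-byte session key

tx_key = derive_key({"objects": ["car", "pedestrian"], "frame": 17})
rx_key = derive_key({"frame": 17, "objects": ["car", "pedestrian"]})
assert tx_key == rx_key   # identical keys, never transmitted
```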
arXiv:2502.15690 (https://arxiv.org/abs/2502.15690) [pdf, other] · cs.IR; cs.AI; cs.CL
Level-Navi Agent: A Framework and Benchmark for Chinese Web Search Agents
Authors: Chuanrui Hu, Shichong Xie, Baoxin Wang, Bin Chen, Xiaofeng Cong, Jun Zhang
Abstract: Large language models (LLMs), adopted to understand human language, drive the development of artificial intelligence (AI) web search agents. Compared to traditional search engines, LLM-powered AI search agents can understand and respond to complex queries with greater depth, enabling more accurate operations and better context recognition. However, little attention has been paid to Chinese web search, so the capabilities of open-source models have not been evaluated uniformly and fairly. The difficulty lies in the lack of three things: a unified agent framework, an accurately labeled dataset, and a suitable evaluation metric. To address these issues, we propose Level-Navi Agent, a general-purpose and training-free web search agent based on level-aware navigation, accompanied by a well-annotated dataset (Web24) and a suitable evaluation metric. Level-Navi Agent can think through complex user questions and conduct searches across various levels of the internet to gather information. Meanwhile, we provide a comprehensive evaluation of state-of-the-art LLMs under fair settings. To further facilitate future research, the source code is available on GitHub.
Submitted 20 December, 2024; originally announced February 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15690v1-abstract-full').style.display = 'none'; document.getElementById('2502.15690v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15688">arXiv:2502.15688</a> <span> [<a href="https://arxiv.org/pdf/2502.15688">pdf</a>, <a href="https://arxiv.org/format/2502.15688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> XPath Agent: An Efficient XPath Programming Agent Based on LLM for Web Crawler </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bryce Wang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+X">Xinyu Luan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15688v1-abstract-short" style="display: inline;"> We present XPath Agent, a production-ready XPath programming agent specifically designed for web crawling and web GUI testing. A key feature of XPath Agent is its ability to automatically generate XPath queries from a set of sampled web pages using a single natural language query. To demonstrate its effectiveness, we benchmark XPath Agent against a state-of-the-art XPath programming agent across a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15688v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15688v1-abstract-full" style="display: none;"> We present XPath Agent, a production-ready XPath programming agent specifically designed for web crawling and web GUI testing. A key feature of XPath Agent is its ability to automatically generate XPath queries from a set of sampled web pages using a single natural language query. To demonstrate its effectiveness, we benchmark XPath Agent against a state-of-the-art XPath programming agent across a range of web crawling tasks. Our results show that XPath Agent achieves comparable performance metrics while significantly reducing token usage and improving clock-time efficiency. The well-designed two-stage pipeline allows for seamless integration into existing web crawling or web GUI testing workflows, thereby saving time and effort in manual XPath query development. The source code for XPath Agent is available at https://github.com/eavae/feilian. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15688v1-abstract-full').style.display = 'none'; document.getElementById('2502.15688v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15483">arXiv:2502.15483</a> <span> [<a href="https://arxiv.org/pdf/2502.15483">pdf</a>, <a href="https://arxiv.org/format/2502.15483">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> MoMa: A Modular Deep Learning Framework for Material Property Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Botian Wang</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+Y">Yawen Ouyang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yaohui Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiqun Wang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+H">Haorui Cui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianbing Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaonan Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wei-Ying Ma</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15483v1-abstract-short" style="display: inline;"> Deep learning methods for material property prediction have been widely explored to advance materials discovery. However, the prevailing pre-train then fine-tune paradigm often fails to address the inherent diversity and disparity of material tasks. To overcome these challenges, we introduce MoMa, a Modular framework for Materials that first trains specialized modules across a wide range of tasks… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15483v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15483v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15483v1-abstract-full" style="display: none;"> Deep learning methods for material property prediction have been widely explored to advance materials discovery. However, the prevailing pre-train then fine-tune paradigm often fails to address the inherent diversity and disparity of material tasks. 

arXiv:2502.15153 [pdf, other] cs.CL
Investigating the Adaptive Robustness with Knowledge Conflicts in LLM-based Multi-Agent Systems
Authors: Tianjie Ju, Bowen Wang, Hao Fei, Mong-Li Lee, Wynne Hsu, Yun Li, Qianren Wang, Pengzhou Cheng, Zongru Wu, Zhuosheng Zhang, Gongshen Liu
Abstract: Recent advances in Large Language Models (LLMs) have upgraded them from sophisticated text generators to autonomous agents capable of cooperation and tool use in multi-agent systems (MASs). However, the robustness of these LLM-based MASs, especially under knowledge conflicts, remains unclear. In this paper, we design four comprehensive metrics to investigate the robustness of MASs when facing mild or task-critical knowledge conflicts. We first analyze mild knowledge conflicts introduced by heterogeneous agents and find that they do not harm system robustness but instead improve collaborative decision-making. Next, we investigate task-critical knowledge conflicts by synthesizing knowledge conflicts and embedding them into one of the agents. Our results show that these conflicts have surprisingly little to no impact on MAS robustness. Furthermore, we observe that MASs demonstrate certain self-repairing capabilities by reducing their reliance on knowledge conflicts and adopting alternative solution paths to maintain stability. Finally, we conduct ablation studies on the knowledge conflict number, agent number, and interaction rounds, finding that the self-repairing capability of MASs has intrinsic limits, and all findings hold consistently across various factors. Our code is publicly available at https://github.com/wbw625/MultiAgentRobustness.
Submitted 20 February, 2025; originally announced February 2025.
Comments: Work in progress
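
The four metrics are not defined in the abstract. As a minimal illustration of the experimental idea (inject a synthesized conflict into one agent and compare task success), one could write the probe below; `mas_run` is a stand-in for a full multi-agent system run, not the paper's harness.

```python
def robustness_drop(mas_run, tasks, conflict) -> float:
    """Toy robustness probe: embed a conflicting fact into one agent's
    knowledge and compare task success rates with and without it.
    mas_run(task, extra_knowledge) -> bool is a hypothetical callable."""
    base = sum(mas_run(t, None) for t in tasks) / len(tasks)
    pert = sum(mas_run(t, conflict) for t in tasks) / len(tasks)
    return base - pert  # near zero would mirror the paper's "little impact" finding
```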
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15072">arXiv:2502.15072</a> <span> [<a href="https://arxiv.org/pdf/2502.15072">pdf</a>, <a href="https://arxiv.org/format/2502.15072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Econometrics">econ.EM</span> </div> </div> <p class="title is-5 mathjax"> Modifying Final Splits of Classification Tree for Fine-tuning Subpopulation Target in Policy Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L+B">Lei Bill Wang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Z">Zhenbang Jiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fangyi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15072v1-abstract-short" style="display: inline;"> Policymakers often use Classification and Regression Trees (CART) to partition populations based on binary outcomes and target subpopulations whose probability of the binary event exceeds a threshold. However, classic CART and knowledge distillation method whose student model is a CART (referred to as KD-CART) do not minimize the misclassification risk associated with classifying the latent probab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15072v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15072v1-abstract-full" style="display: none;"> Policymakers often use Classification and Regression Trees (CART) to partition populations based on binary outcomes and target subpopulations whose probability of the binary event exceeds a threshold. However, classic CART and knowledge distillation method whose student model is a CART (referred to as KD-CART) do not minimize the misclassification risk associated with classifying the latent probabilities of these binary events. To reduce the misclassification risk, we propose two methods, Penalized Final Split (PFS) and Maximizing Distance Final Split (MDFS). PFS incorporates a tunable penalty into the standard CART splitting criterion function. MDFS maximizes a weighted sum of distances between node means and the threshold. It can point-identify the optimal split under the unique intersect latent probability assumption. In addition, we develop theoretical result for MDFS splitting rule estimation, which has zero asymptotic risk. Through extensive simulation studies, we demonstrate that these methods predominately outperform classic CART and KD-CART in terms of misclassification error. Furthermore, in our empirical evaluations, these methods provide deeper insights than the two baseline methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15072v1-abstract-full').style.display = 'none'; document.getElementById('2502.15072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14822">arXiv:2502.14822</a> <span> [<a href="https://arxiv.org/pdf/2502.14822">pdf</a>, <a href="https://arxiv.org/format/2502.14822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Model Architectures in Information Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhichao Xu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhiqi Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Crystina Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P">Puxuan Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bei Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a>, <a href="/search/cs?searchtype=author&query=Srikumar%2C+V">Vivek Srikumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14822v1-abstract-short" style="display: inline;"> This survey examines the evolution of model architectures in information retrieval (IR), focusing on two key aspects: backbone models for feature extraction and end-to-end system architectures for relevance estimation. The review intentionally separates architectural considerations from training methodologies to provide a focused analysis of structural innovations in IR systems.We trace the develo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14822v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14822v1-abstract-full" style="display: none;"> This survey examines the evolution of model architectures in information retrieval (IR), focusing on two key aspects: backbone models for feature extraction and end-to-end system architectures for relevance estimation. The review intentionally separates architectural considerations from training methodologies to provide a focused analysis of structural innovations in IR systems.We trace the development from traditional term-based methods to modern neural approaches, particularly highlighting the impact of transformer-based models and subsequent large language models (LLMs). We conclude by discussing emerging challenges and future directions, including architectural optimizations for performance and scalability, handling of multimodal, multilingual data, and adaptation to novel application domains beyond traditional search paradigms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14822v1-abstract-full').style.display = 'none'; document.getElementById('2502.14822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14739">arXiv:2502.14739</a> <span> [<a href="https://arxiv.org/pdf/2502.14739">pdf</a>, <a href="https://arxiv.org/format/2502.14739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SuperGPQA: Scaling LLM Evaluation across 285 Graduate Disciplines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Team%2C+M">M-A-P Team</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xinrun Du</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yifan Yao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kaijing Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingli Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianyu Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kang Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghao Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yiming Liang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xiaolong Jin</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhenlin Wei</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chujie Zheng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+K">Kaixin Deng</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shian Jia</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Sichao Jiang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yiyan Liao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinrui Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sirun Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yizhi Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunwen Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">Dehua Ma</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+Y">Yuansheng Ni</a>, <a href="/search/cs?searchtype=author&query=Que%2C+H">Haoran Que</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiyao Wang</a> , et al. (71 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14739v3-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. 
Abstract: Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. The capabilities of LLMs in many of these specialized fields, particularly in light industry, agriculture, and service-oriented disciplines, remain inadequately evaluated. To address this gap, we present SuperGPQA, a comprehensive benchmark that evaluates graduate-level knowledge and reasoning capabilities across 285 disciplines. Our benchmark employs a novel Human-LLM collaborative filtering mechanism to eliminate trivial or ambiguous questions through iterative refinement based on both LLM responses and expert feedback. Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence. Additionally, we present comprehensive insights from our management of a large-scale annotation process, involving over 80 expert annotators and an interactive Human-LLM collaborative system, offering valuable methodological guidance for future research initiatives of comparable scope.
Submitted 4 March, 2025; v1 submitted 20 February, 2025; originally announced February 2025.
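
As a hedged illustration of one round of such Human-LLM collaborative filtering (the paper's actual criteria and iteration scheme are certainly richer), a pass might drop questions that every surveyed model solves and questions an expert flags:

```python
def filter_round(questions, model_correct, expert_flag):
    """One illustrative filtering pass.
    questions:     qid -> question text
    model_correct: qid -> list of booleans, one per surveyed LLM
    expert_flag:   qid -> optional 'ambiguous' label from annotators"""
    kept = []
    for qid, text in questions.items():
        if all(model_correct[qid]):
            continue  # likely trivial: no model finds it hard
        if expert_flag.get(qid) == "ambiguous":
            continue  # rejected by expert feedback
        kept.append((qid, text))
    return kept
```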

arXiv:2502.14192 [pdf, other] cs.CL cs.DL
NLP-AKG: Few-Shot Construction of NLP Academic Knowledge Graph Based on LLM
Authors: Jiayin Lan, Jiaqi Li, Baoxin Wang, Ming Liu, Dayong Wu, Shijin Wang, Bing Qin
Abstract: Large language models (LLMs) have been widely applied in question answering over scientific research papers. To enhance the professionalism and accuracy of responses, many studies employ external knowledge augmentation. However, existing structures of external knowledge in scientific literature often focus solely on either paper entities or domain concepts, neglecting the intrinsic connections between papers through shared domain concepts. This results in less comprehensive and specific answers when addressing questions that combine papers and concepts. To address this, we propose a novel knowledge graph framework that captures deep conceptual relations between academic papers, constructing a relational network via intra-paper semantic elements and inter-paper citation relations. Using a few-shot knowledge graph construction method based on LLM, we develop NLP-AKG, an academic knowledge graph for the NLP domain, by extracting 620,353 entities and 2,271,584 relations from 60,826 papers in the ACL Anthology. Based on this, we propose a 'sub-graph community summary' method and validate its effectiveness on three NLP scientific literature question answering datasets.
Submitted 19 February, 2025; originally announced February 2025.
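
The "sub-graph community summary" method is only named in the abstract. A minimal reading, cluster the entity graph and condense each community with an LLM, can be sketched with networkx; the `summarize` callable and the edge format are assumptions.

```python
import networkx as nx
from networkx.algorithms import community

def community_summaries(edges, summarize):
    """Toy 'sub-graph community summary': cluster a paper/concept
    graph, then have an LLM (`summarize`, a stand-in) condense each
    community into a retrievable text unit for question answering.
    edges: iterable of (entity, entity) relation pairs."""
    g = nx.Graph()
    g.add_edges_from(edges)
    comms = community.greedy_modularity_communities(g)
    return [summarize(sorted(c)) for c in comms]
```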
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14192v1-abstract-full').style.display = 'none'; document.getElementById('2502.14192v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14022">arXiv:2502.14022</a> <span> [<a href="https://arxiv.org/pdf/2502.14022">pdf</a>, <a href="https://arxiv.org/format/2502.14022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> A General Framework for Augmenting Lossy Compressors with Topological Guarantees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gorski%2C+N">Nathaniel Gorski</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xin Liang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanqi Guo</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Lin Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14022v1-abstract-short" style="display: inline;"> Topological descriptors such as contour trees are widely utilized in scientific data analysis and visualization, with applications from materials science to climate simulations. It is desirable to preserve topological descriptors when data compression is part of the scientific workflow for these applications. However, classic error-bounded lossy compressors for volumetric data do not guarantee the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14022v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14022v1-abstract-full" style="display: none;"> Topological descriptors such as contour trees are widely utilized in scientific data analysis and visualization, with applications from materials science to climate simulations. It is desirable to preserve topological descriptors when data compression is part of the scientific workflow for these applications. However, classic error-bounded lossy compressors for volumetric data do not guarantee the preservation of topological descriptors, despite imposing strict pointwise error bounds. In this work, we introduce a general framework for augmenting any lossy compressor to preserve the topology of the data during compression. Specifically, our framework quantifies the adjustments (to the decompressed data) needed to preserve the contour tree and then employs a custom variable-precision encoding scheme to store these adjustments. 

arXiv:2502.13894 [pdf, other] cs.RO cs.CV
NavigateDiff: Visual Predictors are Zero-Shot Navigation Assistants
Authors: Yiran Qin, Ao Sun, Yuze Hong, Benyou Wang, Ruimao Zhang
Abstract: Navigating unfamiliar environments presents significant challenges for household robots, requiring the ability to recognize and reason about novel decoration and layout. Existing reinforcement learning methods cannot be directly transferred to new environments, as they typically rely on extensive mapping and exploration, making them time-consuming and inefficient. To address these challenges, we transfer the logical knowledge and generalization ability of pre-trained foundation models to zero-shot navigation. By integrating a large vision-language model with a diffusion network, our approach, NavigateDiff, constructs a visual predictor that continuously predicts the agent's potential observation in the next step, which can assist robots in generating robust actions. Furthermore, to adapt to the temporal property of navigation, we introduce temporal historical information to ensure that the predicted image aligns with the navigation scene. We then carefully design an information fusion framework that embeds the predicted future frames as guidance into the goal-reaching policy to solve downstream image navigation tasks. This approach enhances navigation control and generalization across both simulated and real-world environments. Through extensive experimentation, we demonstrate the robustness and versatility of our method, showcasing its potential to improve the efficiency and effectiveness of robotic navigation in diverse settings.
Submitted 19 February, 2025; originally announced February 2025.
Comments: Accepted to ICRA2025
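
The fusion framework's architecture is not given in the abstract. Purely as an illustration of feeding a predicted next frame into a goal-reaching policy, a PyTorch module might look like this; all dimensions, the shared encoder, and plain concatenation are our choices, not the paper's.

```python
import torch
from torch import nn

class PredictiveFusionPolicy(nn.Module):
    """Illustrative goal-reaching policy that consumes a predicted
    next frame alongside the current observation and the goal image.
    Assumes all three images share one shape."""

    def __init__(self, feat_dim: int = 256, n_actions: int = 4):
        super().__init__()
        self.encoder = nn.LazyLinear(feat_dim)  # input size inferred on first call
        self.head = nn.Sequential(
            nn.Linear(3 * feat_dim, feat_dim), nn.ReLU(),
            nn.Linear(feat_dim, n_actions),
        )

    def forward(self, obs, goal, predicted_next):
        feats = [self.encoder(x.flatten(1)) for x in (obs, goal, predicted_next)]
        return self.head(torch.cat(feats, dim=-1))  # action logits

# policy = PredictiveFusionPolicy()
# logits = policy(obs, goal, predictor(obs, goal))  # predictor: VLM + diffusion stub
```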

arXiv:2502.13172 [pdf, other] cs.CR cs.AI
Unveiling Privacy Risks in LLM Agent Memory
Authors: Bo Wang, Weiyi He, Pengfei He, Shenglai Zeng, Zhen Xiang, Yue Xing, Jiliang Tang
Abstract: Large Language Model (LLM) agents have become increasingly prevalent across various real-world applications. They enhance decision-making by storing private user-agent interactions in the memory module for use as demonstrations, which introduces new privacy risks for LLM agents. In this work, we systematically investigate the vulnerability of LLM agents to our proposed Memory EXTRaction Attack (MEXTRA) under a black-box setting. To extract private information from memory, we propose an effective attacking prompt design and an automated prompt generation method based on different levels of knowledge about the LLM agent. Experiments on two representative agents demonstrate the effectiveness of MEXTRA. Moreover, we explore key factors influencing memory leakage from both the agent's and the attacker's perspectives. Our findings highlight the urgent need for effective memory safeguards in LLM agent design and deployment.
Submitted 17 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>