Search | arXiv e-print repository

Showing 1–50 of 3,686 results for author: Zhang, C

Searching in archive cs. Abstracts are shown; results are sorted by announcement date (newest first), 50 per page. (arXiv Search v0.5.6, released 2020-02-24.)
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14822">arXiv:2502.14822</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14822">pdf</a>, <a href="https://arxiv.org/format/2502.14822">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Model Architectures in Information Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhichao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhiqi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Crystina Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P">Puxuan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Jimmy Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Srikumar%2C+V">Vivek Srikumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14822v1-abstract-short" style="display: inline;"> This survey examines the evolution of model architectures in information retrieval (IR), focusing on two key aspects: backbone models for feature extraction and end-to-end system architectures for relevance estimation. 
The review intentionally separates architectural considerations from training methodologies to provide a focused analysis of structural innovations in IR systems.We trace the develo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14822v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14822v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14822v1-abstract-full" style="display: none;"> This survey examines the evolution of model architectures in information retrieval (IR), focusing on two key aspects: backbone models for feature extraction and end-to-end system architectures for relevance estimation. The review intentionally separates architectural considerations from training methodologies to provide a focused analysis of structural innovations in IR systems.We trace the development from traditional term-based methods to modern neural approaches, particularly highlighting the impact of transformer-based models and subsequent large language models (LLMs). We conclude by discussing emerging challenges and future directions, including architectural optimizations for performance and scalability, handling of multimodal, multilingual data, and adaptation to novel application domains beyond traditional search paradigms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14822v1-abstract-full').style.display = 'none'; document.getElementById('2502.14822v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
2. arXiv:2502.14739 [pdf, other] (cs.CL, Computation and Language)
   SuperGPQA: Scaling LLM Evaluation across 285 Graduate Disciplines
   Authors: M-A-P Team, Xinrun Du, Yifan Yao, Kaijing Ma, Bingli Wang, Tianyu Zheng, Kang Zhu, Minghao Liu, Yiming Liang, Xiaolong Jin, Zhenlin Wei, Chujie Zheng, Kaixing Deng, Shuyue Guo, Shian Jia, Sichao Jiang, Yiyan Liao, Rui Li, Qinrui Li, Sirun Li, Yizhi Li, Yunwen Li, Dehua Ma, Yuansheng Ni, Haoran Que, et al. (70 additional authors not shown)
   Abstract: Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. The capabilities of LLMs in many of these specialized fields, particularly in light industry, agriculture, and service-oriented disciplines, remain inadequately evaluated. To address this gap, we present SuperGPQA, a comprehensive benchmark that evaluates graduate-level knowledge and reasoning capabilities across 285 disciplines. Our benchmark employs a novel Human-LLM collaborative filtering mechanism to eliminate trivial or ambiguous questions through iterative refinement based on both LLM responses and expert feedback. Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence. Additionally, we present comprehensive insights from our management of a large-scale annotation process, involving over 80 expert annotators and an interactive Human-LLM collaborative system, offering valuable methodological guidance for future research initiatives of comparable scope.
   Submitted 20 February, 2025; originally announced February 2025.
3. arXiv:2502.14616 [pdf, other] (cs.CV, Computer Vision and Pattern Recognition)
   Monocular Depth Estimation and Segmentation for Transparent Object with Iterative Semantic and Geometric Fusion
   Authors: Jiangyuan Liu, Hongxuan Ma, Yuxin Guo, Yuhao Zhao, Chi Zhang, Wei Sui, Wei Zou
   Abstract: Transparent object perception is indispensable for numerous robotic tasks. However, accurately segmenting and estimating the depth of transparent objects remain challenging due to complex optical properties. Existing methods primarily delve into only one task using extra inputs or specialized sensors, neglecting the valuable interactions among tasks and the subsequent refinement process, leading to suboptimal and blurry predictions. To address these issues, we propose a monocular framework, which is the first to excel in both segmentation and depth estimation of transparent objects, with only a single-image input. Specifically, we devise a novel semantic and geometric fusion module, effectively integrating the multi-scale information between tasks. In addition, drawing inspiration from human perception of objects, we further incorporate an iterative strategy, which progressively refines initial features for clearer results. Experiments on two challenging synthetic and real-world datasets demonstrate that our model surpasses state-of-the-art monocular, stereo, and multi-view methods by a large margin of about 38.8%-46.2% with only a single RGB input. Codes and models are publicly available at https://github.com/L-J-Yuan/MODEST.
   Submitted 20 February, 2025; originally announced February 2025.
   Comments: Accepted by ICRA (2025). The code is accessible at: https://github.com/L-J-Yuan/MODEST

4. arXiv:2502.14379 [pdf, other] (cs.LG, Machine Learning; cs.DS, Data Structures and Algorithms)
   Achieving adaptivity and optimality for multi-armed bandits using Exponential Kullback-Leibler Maillard Sampling
   Authors: Hao Qin, Kwang-Sung Jun, Chicheng Zhang
   Abstract: We study the problem of Multi-Armed Bandits (MAB) with reward distributions belonging to a One-Parameter Exponential Distribution (OPED) family. In the literature, several criteria have been proposed to evaluate the performance of such algorithms, including Asymptotic Optimality (A.O.), Minimax Optimality (M.O.), Sub-UCB, and variance-adaptive worst-case regret bound. Thompson Sampling (TS)-based and Upper Confidence Bound (UCB)-based algorithms have been employed to achieve some of these criteria. However, none of these algorithms simultaneously satisfy all the aforementioned criteria. In this paper, we design an algorithm, Exponential Kullback-Leibler Maillard Sampling, that can achieve multiple optimality criteria simultaneously, including A.O., M.O. with a logarithmic factor, Sub-UCB, and variance-adaptive worst-case regret bound.
   Submitted 20 February, 2025; originally announced February 2025.
   Comments: 12 pages of the main body, 2 figures, 43 pages in total
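For readers unfamiliar with the criteria listed in the previous abstract, the block below gives the standard textbook definitions of cumulative regret and asymptotic optimality. These are generic statements of the Lai-Robbins benchmark, not formulas quoted from arXiv:2502.14379.

```latex
% Standard bandit notation (textbook form, not quoted from arXiv:2502.14379):
% mu^* is the best arm's mean, Delta_a = mu^* - mu_a, and N_a(T) counts pulls of arm a.
\[
  \mathrm{Reg}(T) \;=\; T\mu^{*} - \mathbb{E}\Big[\sum_{t=1}^{T} r_t\Big]
                 \;=\; \sum_{a:\,\Delta_a>0} \Delta_a\,\mathbb{E}\big[N_a(T)\big].
\]
% Asymptotic optimality: the algorithm matches the Lai-Robbins instance-dependent
% lower bound, where KL(nu_a, nu^*) is the Kullback-Leibler divergence between the
% reward distribution of arm a and that of the best arm:
\[
  \limsup_{T\to\infty} \frac{\mathrm{Reg}(T)}{\log T}
  \;\le\; \sum_{a:\,\Delta_a>0} \frac{\Delta_a}{\mathrm{KL}(\nu_a,\nu^{*})}.
\]
% Minimax optimality instead asks for a worst-case regret of order sqrt(KT) over
% all K-armed instances, up to problem-independent factors.
```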
5. arXiv:2502.14205 [pdf, other] (cs.LG, Machine Learning; cs.AI, Artificial Intelligence)
   Accurate Forgetting for Heterogeneous Federated Continual Learning
   Authors: Abudukelimu Wuerkaixi, Sen Cui, Jingfeng Zhang, Kunda Yan, Bo Han, Gang Niu, Lei Fang, Changshui Zhang, Masashi Sugiyama
   Abstract: Recent years have witnessed a burgeoning interest in federated learning (FL). However, the contexts in which clients engage in sequential learning remain under-explored. Bridging FL and continual learning (CL) gives rise to a challenging practical problem: federated continual learning (FCL). Existing research in FCL primarily focuses on mitigating the catastrophic forgetting issue of continual learning while collaborating with other clients. We argue that the forgetting phenomena are not invariably detrimental. In this paper, we consider a more practical and challenging FCL setting characterized by potentially unrelated or even antagonistic data/tasks across different clients. In the FL scenario, statistical heterogeneity and data noise among clients may exhibit spurious correlations which result in biased feature learning. While existing CL strategies focus on a complete utilization of previous knowledge, we found that forgetting biased information is beneficial in our study. Therefore, we propose a new concept, accurate forgetting (AF), and develop a novel generative-replay method which selectively utilizes previous knowledge in federated networks. We employ a probabilistic framework based on a normalizing flow model to quantify the credibility of previous knowledge. Comprehensive experiments affirm the superiority of our method over baselines.
   Submitted 19 February, 2025; originally announced February 2025.
   Comments: Published in ICLR 2024

6. arXiv:2502.13943 [pdf, other] (cs.AI, Artificial Intelligence; cs.CL, Computation and Language; cs.LG, Machine Learning)
   AdaptiveStep: Automatically Dividing Reasoning Step through Model Confidence
   Authors: Yuliang Liu, Junjie Lu, Zhaoling Chen, Chaofeng Qu, Jason Klein Liu, Chonghan Liu, Zefan Cai, Yunhui Xia, Li Zhao, Jiang Bian, Chuheng Zhang, Wei Shen, Zhouhan Lin
   Abstract: Current approaches for training Process Reward Models (PRMs) often involve breaking down responses into multiple reasoning steps using rule-based techniques, such as using predefined placeholder tokens or setting the reasoning step's length to a fixed size. These approaches overlook the fact that specific words do not typically mark true decision points in a text. To address this, we propose AdaptiveStep, a method that divides reasoning steps based on the model's confidence in predicting the next word. This division method provides more decision-making information at each step, enhancing downstream tasks, such as reward model learning. Moreover, our method does not require manual annotation. We demonstrate its effectiveness through experiments with AdaptiveStep-trained PRMs in mathematical reasoning and code generation tasks. Experimental results indicate that the outcome PRM achieves state-of-the-art Best-of-N performance, surpassing the greedy search strategy with token-level value-guided decoding, while also reducing construction costs by over 30% compared to existing open-source PRMs. In addition, we provide a thorough analysis and case study on the PRM's performance, transferability, and generalization capabilities.
   Submitted 19 February, 2025; originally announced February 2025.
   Comments: 17 pages
7. arXiv:2502.13595 [pdf, other] (cs.CL, Computation and Language; cs.AI, Artificial Intelligence; cs.IR, Information Retrieval)
   MMTEB: Massive Multilingual Text Embedding Benchmark
   Authors: Kenneth Enevoldsen, Isaac Chung, Imene Kerboua, Márton Kardos, Ashwin Mathur, David Stap, Jay Gala, Wissam Siblini, Dominik Krzemiński, Genta Indra Winata, Saba Sturua, Saiteja Utpala, Mathieu Ciancone, Marion Schaeffer, Gabriel Sequeira, Diganta Misra, Shreeya Dhakal, Jonathan Rystrøm, Roman Solomatin, Ömer Çağatan, Akash Kundu, Martin Bernstorff, Shitao Xiao, Akshita Sukhlecha, Bhavish Pahwa, et al. (61 additional authors not shown)
   Abstract: Text embeddings are typically evaluated on a limited set of tasks, which are constrained by language, domain, and task diversity. To address these limitations and provide a more comprehensive evaluation, we introduce the Massive Multilingual Text Embedding Benchmark (MMTEB), a large-scale, community-driven expansion of MTEB, covering over 500 quality-controlled evaluation tasks across 250+ languages. MMTEB includes a diverse set of challenging, novel tasks such as instruction following, long-document retrieval, and code retrieval, representing the largest multilingual collection of evaluation tasks for embedding models to date. Using this collection, we develop several highly multilingual benchmarks, which we use to evaluate a representative set of models. We find that while large language models (LLMs) with billions of parameters can achieve state-of-the-art performance on certain language subsets and task categories, the best-performing publicly available model is multilingual-e5-large-instruct with only 560 million parameters. To facilitate accessibility and reduce computational cost, we introduce a novel downsampling method based on inter-task correlation, ensuring a diverse selection while preserving relative model rankings. Furthermore, we optimize tasks such as retrieval by sampling hard negatives, creating smaller but effective splits. These optimizations allow us to introduce benchmarks that drastically reduce computational demands. For instance, our newly introduced zero-shot English benchmark maintains a ranking order similar to the full-scale version but at a fraction of the computational cost.
   Submitted 19 February, 2025; originally announced February 2025.
   Comments: Accepted for ICLR: https://openreview.net/forum?id=zl3pfz4VCV
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13363v1-abstract-full').style.display = 'none'; document.getElementById('2502.13363v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the 2025 Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics (NAACL 2025). The first two authors contributed equally and were listed in random order</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13076">arXiv:2502.13076</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13076">pdf</a>, <a href="https://arxiv.org/format/2502.13076">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KAPPA: A Generic Patent Analysis Framework with Keyphrase-Based Portraits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xin Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+G">Guisheng Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+L">Linning Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13076v1-abstract-short" style="display: inline;"> Patent analysis highly relies on concise and interpretable document representations, referred to as patent portraits. Keyphrases, both present and absent, are ideal candidates for patent portraits due to their brevity, representativeness, and clarity. In this paper, we introduce KAPPA, an integrated framework designed to construct keyphrase-based patent portraits and enhance patent analysis. KAPPA&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13076v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13076v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13076v1-abstract-full" style="display: none;"> Patent analysis highly relies on concise and interpretable document representations, referred to as patent portraits. Keyphrases, both present and absent, are ideal candidates for patent portraits due to their brevity, representativeness, and clarity. In this paper, we introduce KAPPA, an integrated framework designed to construct keyphrase-based patent portraits and enhance patent analysis. KAPPA operates in two phases: patent portrait construction and portrait-based analysis. 
To ensure effective portrait construction, we propose a semantic-calibrated keyphrase generation paradigm that integrates pre-trained language models with a prompt-based hierarchical decoding strategy to leverage the multi-level structural characteristics of patents. For portrait-based analysis, we develop a comprehensive framework that employs keyphrase-based patent portraits to enable efficient and accurate patent analysis. Extensive experiments on benchmark datasets of keyphrase generation, the proposed model achieves significant improvements compared to state-of-the-art baselines. Further experiments conducted on real-world patent applications demonstrate that our keyphrase-based portraits effectively capture domain-specific knowledge and enrich semantic representation for patent analysis tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13076v1-abstract-full').style.display = 'none'; document.getElementById('2502.13076v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12961">arXiv:2502.12961</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12961">pdf</a>, <a href="https://arxiv.org/format/2502.12961">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Tool Use in Large Language Models with Meta-Cognition Trigger </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenjun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dexun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+K">Kuicai Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Cong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Weiwen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yasheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+R">Ruiming Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12961v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown remarkable emergent capabilities, transforming the execution of functional tasks by leveraging external tools for complex problems that require specialized processing or real-time data. 
While existing research expands LLMs access to diverse tools (e.g., program interpreters, search engines, weather/map apps), the necessity of using these tools is often overl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12961v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12961v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12961v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown remarkable emergent capabilities, transforming the execution of functional tasks by leveraging external tools for complex problems that require specialized processing or real-time data. While existing research expands LLMs access to diverse tools (e.g., program interpreters, search engines, weather/map apps), the necessity of using these tools is often overlooked, leading to indiscriminate tool invocation. This naive approach raises two key issues:(1) increased delays due to unnecessary tool calls, and (2) potential errors resulting from faulty interactions with external tools. In this paper, we introduce meta-cognition as a proxy for LLMs self-assessment of their capabilities, representing the model&#39;s awareness of its own limitations. Based on this, we propose MeCo, an adaptive decision-making strategy for external tool use. MeCo quantifies metacognitive scores by capturing high-level cognitive signals in the representation space, guiding when to invoke tools. Notably, MeCo is fine-tuning-free and incurs minimal cost. Our experiments show that MeCo accurately detects LLMs&#39; internal cognitive signals and significantly improves tool-use decision-making across multiple base models and benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12961v1-abstract-full').style.display = 'none'; document.getElementById('2502.12961v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
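The MeCo abstract describes gating tool calls on a metacognitive score read from the model's representation space. The sketch below shows the general shape of such a gate as an illustration only, not the MeCo implementation: a linear probe (assumed trained offline) maps a hidden-state vector to a self-confidence score, and the external tool is invoked only when that score is low. The probe weights, threshold, and hidden-state extraction are all assumptions.

```python
# Illustrative sketch of a metacognition-style tool-use gate (not the MeCo code):
# a linear probe scores the model's hidden state for "I can answer this myself";
# the external tool is called only when that self-assessment is low.
import numpy as np

class ToolUseGate:
    def __init__(self, probe_w: np.ndarray, probe_b: float, threshold: float = 0.5):
        self.w, self.b, self.threshold = probe_w, probe_b, threshold

    def metacognitive_score(self, hidden_state: np.ndarray) -> float:
        # Logistic probe over the representation space (weights assumed trained offline).
        return float(1.0 / (1.0 + np.exp(-(hidden_state @ self.w + self.b))))

    def should_call_tool(self, hidden_state: np.ndarray) -> bool:
        # Low self-confidence -> route the query to an external tool.
        return self.metacognitive_score(hidden_state) < self.threshold

# Toy usage with random stand-ins for a real model's hidden state and probe weights.
rng = np.random.default_rng(0)
gate = ToolUseGate(probe_w=rng.normal(size=768), probe_b=0.0, threshold=0.5)
hidden = rng.normal(size=768)   # would come from the LLM's last-layer representation
print("call tool?", gate.should_call_tool(hidden))
```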
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12908">arXiv:2502.12908</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12908">pdf</a>, <a href="https://arxiv.org/format/2502.12908">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Graph Neural Networks for Databases: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Ziming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Youhuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guoliang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chuxu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12908v2-abstract-short" style="display: inline;"> Graph neural networks (GNNs) are powerful deep learning models for graph-structured data, demonstrating remarkable success across diverse domains. Recently, the database (DB) community has increasingly recognized the potentiality of GNNs, prompting a surge of researches focusing on improving database systems through GNN-based approaches. However, despite notable advances, There is a lack of a comp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12908v2-abstract-full').style.display = 'inline'; document.getElementById('2502.12908v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12908v2-abstract-full" style="display: none;"> Graph neural networks (GNNs) are powerful deep learning models for graph-structured data, demonstrating remarkable success across diverse domains. Recently, the database (DB) community has increasingly recognized the potentiality of GNNs, prompting a surge of researches focusing on improving database systems through GNN-based approaches. However, despite notable advances, There is a lack of a comprehensive review and understanding of how GNNs could improve DB systems. Therefore, this survey aims to bridge this gap by providing a structured and in-depth overview of GNNs for DB systems. Specifically, we propose a new taxonomy that classifies existing methods into two key categories: (1) Relational Databases, which includes tasks like performance prediction, query optimization, and text-to-SQL, and (2) Graph Databases, addressing challenges like efficient graph query processing and graph similarity computation. We systematically review key methods in each category, highlighting their contributions and practical implications. Finally, we suggest promising avenues for integrating GNNs into Database systems. 
Submitted 19 February, 2025; v1 submitted 18 February, 2025; originally announced February 2025.
Comments: A survey focusing on GNNs and databases. 9 pages, 4 figures.

arXiv:2502.12513 (https://arxiv.org/abs/2502.12513) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
RealSyn: An Effective and Scalable Multimodal Interleaved Document Transformation Paradigm
Authors: Tiancheng Gu, Kaicheng Yang, Chaoyi Zhang, Yin Xie, Xiang An, Ziyong Feng, Dongnan Liu, Weidong Cai, Jiankang Deng
Abstract: After pre-training on extensive image-text pairs, Contrastive Language-Image Pre-training (CLIP) demonstrates promising performance on a wide variety of benchmarks. However, a substantial volume of non-paired data, such as multimodal interleaved documents, remains underutilized for vision-language representation learning.
To fully leverage these unpaired documents, we initially establish a Real-World Data Extraction pipeline to extract high-quality images and texts. Then we design a hierarchical retrieval method to efficiently associate each image with multiple semantically relevant realistic texts. To further enhance fine-grained visual information, we propose an image semantic augmented generation module for synthetic text production. Furthermore, we employ a semantic balance sampling strategy to improve dataset diversity, enabling better learning of long-tail concepts. Based on these innovations, we construct RealSyn, a dataset combining realistic and synthetic texts, available in three scales: 15M, 30M, and 100M. Extensive experiments demonstrate that RealSyn effectively advances vision-language representation learning and exhibits strong scalability. Models pre-trained on RealSyn achieve state-of-the-art performance on multiple downstream tasks. To facilitate future research, the RealSyn dataset and pre-trained model weights are released at https://github.com/deepglint/RealSyn.
Submitted 17 February, 2025; originally announced February 2025.
Comments: 16 pages, 12 figures. Webpage: https://garygutc.github.io/RealSyn
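A minimal sketch of the kind of embedding-based association such a retrieval step might perform, pairing each image with its top-k most similar texts by cosine similarity; the embeddings below are random placeholders rather than real CLIP features, and K is an illustrative choice.

```python
import numpy as np

rng = np.random.default_rng(0)
image_embs = rng.normal(size=(4, 64))    # placeholder image embeddings
text_embs = rng.normal(size=(100, 64))   # placeholder text embeddings

# L2-normalize so dot products become cosine similarities.
image_embs /= np.linalg.norm(image_embs, axis=1, keepdims=True)
text_embs /= np.linalg.norm(text_embs, axis=1, keepdims=True)

K = 3                                    # texts to associate with each image
sims = image_embs @ text_embs.T          # (num_images, num_texts)
topk = np.argsort(-sims, axis=1)[:, :K]  # indices of the most similar texts

for i, idx in enumerate(topk):
    print(f"image {i} -> texts {idx.tolist()}")
```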
arXiv:2502.12412 (https://arxiv.org/abs/2502.12412) [pdf, other]
Subjects: cs.LG (Machine Learning); eess.IV (Image and Video Processing)
Incomplete Graph Learning: A Comprehensive Survey
Authors: Riting Xia, Huibo Liu, Anchen Li, Xueyan Liu, Yan Zhang, Chunxu Zhang, Bo Yang
Abstract: Graph learning is a prevalent field that operates on ubiquitous graph data. Effective graph learning methods can extract valuable information from graphs. However, these methods are not robust to missing attributes in graphs, resulting in sub-optimal outcomes. This has led to the emergence of incomplete graph learning, which aims to process and learn from incomplete graphs to achieve more accurate and representative results. In this paper, we conduct a comprehensive review of the literature on incomplete graph learning. Initially, we categorize incomplete graphs and provide precise definitions of relevant concepts, terminologies, and techniques, thereby establishing a solid understanding for readers. Subsequently, we classify incomplete graph learning methods according to the types of incompleteness: (1) attribute-incomplete graph learning methods, (2) attribute-missing graph learning methods, and (3) hybrid-absent graph learning methods. By systematically classifying and summarizing incomplete graph learning methods, we highlight the commonalities and differences among existing approaches, aiding readers in selecting methods and laying the groundwork for further advancements. In addition, we summarize the datasets, incomplete processing modes, evaluation metrics, and application domains used by current methods. Lastly, we discuss the current challenges and propose future directions for incomplete graph learning, with the aim of stimulating further innovation in this crucial field. To our knowledge, this is the first review dedicated to incomplete graph learning, aiming to offer valuable insights for researchers in related fields. We developed an online resource to follow relevant research based on this review, available at https://github.com/cherry-a11y/Incomplete-graph-learning.git
Submitted 17 February, 2025; originally announced February 2025.
arXiv:2502.12123 (https://arxiv.org/abs/2502.12123) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
On the Query Complexity of Verifier-Assisted Language Generation
Authors: Edoardo Botta, Yuchen Li, Aashay Mehta, Jordan T. Ash, Cyril Zhang, Andrej Risteski
Abstract: Recently, a plethora of works have proposed inference-time algorithms (e.g. best-of-n) that incorporate verifiers to assist the generation process. Their quality-efficiency trade-offs have been empirically benchmarked on a variety of constrained generation tasks, but the algorithmic design landscape remains poorly understood. In this paper, we develop a mathematical framework for reasoning about constrained generation using a pre-trained language model generator oracle and a process verifier, which can decide whether a prefix can be extended to a string that satisfies the constraints of choice. We show that even in very simple settings, access to a verifier can render an intractable problem (information-theoretically or computationally) tractable. In fact, we show that even simple algorithms, like tokenwise rejection sampling, can enjoy significant benefits from access to a verifier. Empirically, we show that a natural modification of tokenwise rejection sampling, in which the sampler is allowed to "backtrack" (i.e., erase the final few generated tokens), has robust and substantive benefits over natural baselines (e.g. (blockwise) rejection sampling, nucleus sampling) in terms of computational efficiency, accuracy, and diversity.
Submitted 17 February, 2025; originally announced February 2025.
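To make "tokenwise rejection sampling with backtracking" concrete, here is a self-contained toy sketch: a random "generator" proposes tokens and a verifier checks whether the current prefix can still be completed into a balanced-parentheses string of fixed length; after repeated rejections the sampler erases its last accepted token. The constraint, the generator, and the retry budget are illustrative stand-ins, not the paper's construction.

```python
import random

random.seed(0)
LENGTH = 10        # target string length (illustrative constraint)
TOKENS = "()"

def verifier(prefix: str) -> bool:
    """Can `prefix` still be extended to a balanced string of length LENGTH?"""
    depth = 0
    for ch in prefix:
        depth += 1 if ch == "(" else -1
        if depth < 0:
            return False
    remaining = LENGTH - len(prefix)
    return remaining >= depth and (remaining - depth) % 2 == 0

def generator(prefix: str) -> str:
    """Toy stand-in for a language model: propose the next token at random."""
    return random.choice(TOKENS)

def generate_with_backtracking(max_tries: int = 8) -> str:
    prefix = ""
    while len(prefix) < LENGTH:
        for _ in range(max_tries):
            token = generator(prefix)
            if verifier(prefix + token):   # accept only verified extensions
                prefix += token
                break
        else:
            prefix = prefix[:-1]           # backtrack: erase the last accepted token
    return prefix

print(generate_with_backtracking())        # prints a balanced string of length 10
```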
arXiv:2502.11903 (https://arxiv.org/abs/2502.11903) [pdf, other]
Subjects: cs.CL (Computation and Language)
MMRC: A Large-Scale Benchmark for Understanding Multimodal Large Language Model in Real-World Conversation
Authors: Haochen Xue, Feilong Tang, Ming Hu, Yexin Liu, Qidong Huang, Yulong Li, Chengzhi Liu, Zhongxing Xu, Chong Zhang, Chun-Mei Feng, Yutong Xie, Imran Razzak, Zongyuan Ge, Jionglong Su, Junjun He, Yu Qiao
Abstract: Recent multimodal large language models (MLLMs) have demonstrated significant potential in open-ended conversation, generating more accurate and personalized responses.
However, their abilities to memorize, recall, and reason in sustained interactions within real-world scenarios remain underexplored. This paper introduces MMRC, a Multi-Modal Real-world Conversation benchmark for evaluating six core open-ended abilities of MLLMs: information extraction, multi-turn reasoning, information update, image management, memory recall, and answer refusal. With data collected from real-world scenarios, MMRC comprises 5,120 conversations and 28,720 corresponding manually labeled questions, posing a significant challenge to existing MLLMs. Evaluations of 20 MLLMs on MMRC indicate an accuracy drop during open-ended interactions. We identify four common failure patterns: long-term memory degradation, inadequacies in updating factual knowledge, accumulated assumption of error propagation, and reluctance to say no. To mitigate these issues, we propose a simple yet effective NOTE-TAKING strategy, which records key information from the conversation and reminds the model of it during its responses, enhancing conversational capabilities. Experiments across six MLLMs demonstrate significant performance improvements.
Submitted 17 February, 2025; originally announced February 2025.
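A minimal sketch in the spirit of the NOTE-TAKING strategy described above: key facts are recorded in a running note and prepended to every new prompt. The llm() stub and the manual take_note() call are placeholders for illustration, not the paper's implementation.

```python
from typing import List

def llm(prompt: str) -> str:
    """Placeholder for a real (multimodal) LLM call."""
    return f"(model response to: {prompt[:40]}...)"

class NoteTakingChat:
    def __init__(self) -> None:
        self.notes: List[str] = []

    def take_note(self, fact: str) -> None:
        """Record a key piece of information from the conversation."""
        self.notes.append(fact)

    def ask(self, user_message: str) -> str:
        # Remind the model of the recorded notes on every turn.
        reminder = "Known facts so far:\n" + "\n".join(f"- {n}" for n in self.notes)
        return llm(reminder + "\n\nUser: " + user_message)

chat = NoteTakingChat()
chat.take_note("The user's cat is named Momo.")
print(chat.ask("What should I feed my cat?"))
```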
arXiv:2502.11775 (https://arxiv.org/abs/2502.11775) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
video-SALMONN-o1: Reasoning-enhanced Audio-visual Large Language Model
Authors: Guangzhi Sun, Yudong Yang, Jimin Zhuang, Changli Tang, Yixuan Li, Wei Li, Zejun MA, Chao Zhang
Abstract: While recent advancements in reasoning optimization have significantly enhanced the capabilities of large language models (LLMs), existing efforts to improve reasoning have been limited to solving mathematical problems and focusing on visual graphical inputs, neglecting broader applications in general video understanding. This paper proposes video-SALMONN-o1, the first open-source reasoning-enhanced audio-visual LLM designed for general video understanding tasks. To enhance its reasoning abilities, we develop a reasoning-intensive dataset featuring challenging audio-visual questions with step-by-step solutions. We also propose process direct preference optimization (pDPO), which leverages contrastive step selection to achieve efficient step-level reward modelling tailored for multimodal inputs. Additionally, we introduce RivaBench, the first reasoning-intensive video understanding benchmark, featuring over 4,000 high-quality, expert-curated question-answer pairs across scenarios such as standup comedy, academic presentations, and synthetic video detection. video-SALMONN-o1 achieves 3-8% accuracy improvements over the LLaVA-OneVision baseline across different video reasoning benchmarks, and pDPO achieves 6-8% improvements over the supervised fine-tuning model on RivaBench. The enhanced reasoning also gives video-SALMONN-o1 zero-shot synthetic video detection capabilities.
Submitted 17 February, 2025; originally announced February 2025.
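For background on the process direct preference optimization (pDPO) mentioned above: it builds on the DPO objective, which scores a preferred versus a dispreferred continuation by their policy-vs-reference log-probability ratios. The snippet below computes that generic DPO loss on toy numbers; it is background only, not the paper's contrastive step-selection procedure.

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Standard DPO objective on (summed) log-probs of two candidate steps."""
    chosen_ratio = logp_chosen - ref_logp_chosen        # log pi/pi_ref, preferred step
    rejected_ratio = logp_rejected - ref_logp_rejected  # log pi/pi_ref, dispreferred step
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()

# Toy numbers standing in for per-step sequence log-probabilities.
logp_c = torch.tensor([-12.3, -8.1])
logp_r = torch.tensor([-13.0, -9.4])
ref_c = torch.tensor([-12.0, -8.0])
ref_r = torch.tensor([-12.5, -9.0])
print(dpo_loss(logp_c, logp_r, ref_c, ref_r))
```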
arXiv:2502.11588 (https://arxiv.org/abs/2502.11588) [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.NI (Networking and Internet Architecture)
A Unified Modeling Framework for Automated Penetration Testing
Authors: Yunfei Wang, Shixuan Liu, Wenhao Wang, Changling Zhou, Chao Zhang, Jiandong Jin, Cheng Zhu
Abstract: The integration of artificial intelligence into automated penetration testing (AutoPT) has highlighted the necessity of simulation modeling for the training of intelligent agents, due to its cost-efficiency and swift feedback capabilities. Despite the proliferation of AutoPT research, there is a recognized gap in the availability of a unified framework for simulation modeling methods. This paper presents a systematic review and synthesis of existing techniques, introducing MDCPM to categorize studies based on literature objectives, network simulation complexity, dependency of technical and tactical operations, and scenario feedback and variation. To address the lack of a unified method for multi-dimensional and multi-level simulation modeling, the need for dynamic environment modeling, and the scarcity of public datasets, we introduce AutoPT-Sim, a novel modeling framework based on policy automation that encompasses all sub-dimensions. AutoPT-Sim offers a comprehensive approach to modeling network environments, attackers, and defenders, transcending the constraints of static modeling and accommodating networks of diverse scales. We publicly release a generated standard network environment dataset and the code of the Network Generator.
By flexibly integrating publicly available datasets, AutoPT-Sim supports the simulation modeling levels in MDCPM that focus on policy automation, and the Network Generator helps researchers produce customized target network data by adjusting parameters or fine-tuning the generator.
Submitted 17 February, 2025; originally announced February 2025.
arXiv:2502.11345 (https://arxiv.org/abs/2502.11345) [pdf, other]
Subjects: cs.CL (Computation and Language)
Hierarchical Graph Topic Modeling with Topic Tree-based Transformer
Authors: Delvin Ce Zhang, Menglin Yang, Xiaobao Wu, Jiasheng Zhang, Hady W. Lauw
Abstract: Textual documents are commonly connected in a hierarchical graph structure where a central document links to others with an exponentially growing connectivity. Though Hyperbolic Graph Neural Networks (HGNNs) excel at capturing such graph hierarchy, they cannot model the rich textual semantics within documents. Moreover, text contents in documents usually discuss topics of different specificity. Hierarchical Topic Models (HTMs) discover such latent topic hierarchy within text corpora. However, most of them focus on the textual content within documents, and ignore the graph adjacency across interlinked documents. We thus propose a Hierarchical Graph Topic Modeling Transformer to integrate both topic hierarchy within documents and graph hierarchy across documents into a unified Transformer. Specifically, to incorporate topic hierarchy within documents, we design a topic tree and infer a hierarchical tree embedding for hierarchical topic modeling. To preserve both topic and graph hierarchies, we design our model in hyperbolic space and propose a Hyperbolic Doubly Recurrent Neural Network, which models ancestral and fraternal tree structure. Both hierarchies are inserted into each Transformer layer to learn unified representations. Both supervised and unsupervised experiments verify the effectiveness of our model.
Submitted 16 February, 2025; originally announced February 2025.
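Since the model above operates in hyperbolic space, the relevant notion of distance is the Poincaré-ball distance rather than the Euclidean one. The helper below implements the standard formula for reference; it is generic background, not the paper's implementation, and the two example points are arbitrary.

```python
import numpy as np

def poincare_distance(u: np.ndarray, v: np.ndarray, eps: float = 1e-9) -> float:
    """Distance between two points inside the unit Poincare ball."""
    diff = np.sum((u - v) ** 2)
    denom = (1.0 - np.sum(u ** 2)) * (1.0 - np.sum(v ** 2))
    return float(np.arccosh(1.0 + 2.0 * diff / (denom + eps)))

root = np.array([0.0, 0.0])    # e.g. a general topic near the origin
leaf = np.array([0.7, 0.55])   # a more specific topic near the boundary
print(poincare_distance(root, leaf))
```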
arXiv:2502.11201 (https://arxiv.org/abs/2502.11201) [pdf, other]
Subjects: cs.DB (Databases); cs.AI (Artificial Intelligence)
Bridging the Gap: Enabling Natural Language Queries for NoSQL Databases through Text-to-NoSQL Translation
Authors: Jinwei Lu, Yuanfeng Song, Zhiqian Qin, Haodi Zhang, Chen Zhang, Raymond Chi-Wing Wong
Abstract: NoSQL databases have become increasingly popular due to their outstanding performance in handling large-scale, unstructured, and semi-structured data, highlighting the need for user-friendly interfaces to bridge the gap between non-technical users and complex database queries. In this paper, we introduce the Text-to-NoSQL task, which aims to convert natural language queries into NoSQL queries, thereby lowering the technical barrier for non-expert users. To promote research in this area, we developed a novel automated dataset construction process and released a large-scale and open-source dataset for this task, named TEND (short for Text-to-NoSQL Dataset). Additionally, we designed a SLM (Small Language Model)-assisted and RAG (Retrieval-Augmented Generation)-assisted multi-step framework called SMART, which is specifically designed for Text-to-NoSQL conversion. To ensure comprehensive evaluation of the models, we also introduced a detailed set of metrics that assess the model's performance on both the query itself and its execution results. Our experimental results demonstrate the effectiveness of our approach and establish a benchmark for future research in this emerging field. We believe that our contributions will pave the way for more accessible and intuitive interactions with NoSQL databases.
Submitted 18 February, 2025; v1 submitted 16 February, 2025; originally announced February 2025.
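For readers unfamiliar with the task, a Text-to-NoSQL pair maps a natural-language request onto, for example, a MongoDB query. The hand-written example below (not taken from TEND; the collection and field names are invented) shows the kind of output such a system must produce.

```python
# Natural-language request (hypothetical):
#   "List the names of employees in the Sales department older than 30,
#    sorted by age in descending order."
#
# A corresponding MongoDB query, expressed as the filter/projection/sort
# documents a Text-to-NoSQL system would need to generate:
query_filter = {"department": "Sales", "age": {"$gt": 30}}
projection = {"name": 1, "_id": 0}
sort_spec = [("age", -1)]

# With pymongo this would be executed as:
#   db.employees.find(query_filter, projection).sort(sort_spec)
print(query_filter, projection, sort_spec)
```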
arXiv:2502.11179 (https://arxiv.org/abs/2502.11179) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
RT-DEMT: A hybrid real-time acupoint detection model combining mamba and transformer
Authors: Shilong Yang, Qi Zang, Chulong Zhang, Lingfeng Huang, Yaoqin Xie
Abstract: Traditional Chinese acupuncture methods often face controversy in clinical practice due to their high subjectivity. Additionally, current intelligent-assisted acupuncture systems have two major limitations: slow acupoint localization and low accuracy. To address these limitations, the proposed method leverages the excellent inference efficiency of the state-space model Mamba, while retaining the advantages of the attention mechanism in the traditional DETR architecture, to achieve efficient global information integration and provide high-quality feature information for acupoint localization tasks. Furthermore, by employing the concept of residual likelihood estimation, it eliminates the need for complex upsampling processes, thereby accelerating the acupoint localization task. Our method achieved state-of-the-art (SOTA) accuracy on a private dataset of acupoints on the human back, with an average Euclidean distance pixel error (EPE) of 7.792 and an average time consumption of 10.05 milliseconds per localization task. Compared to the second-best algorithm, our method improved both accuracy and speed by approximately 14%. This significant advancement not only enhances the efficacy of acupuncture treatment but also demonstrates the commercial potential of automated acupuncture robot systems. Access to our method is available at https://github.com/Sohyu1/RT-DEMT
Submitted 16 February, 2025; originally announced February 2025.
Comments: 10 pages, 3 figures.
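The Euclidean distance pixel error (EPE) quoted above is simply the mean Euclidean distance between predicted and ground-truth keypoint coordinates; a minimal computation on made-up coordinates:

```python
import numpy as np

# Made-up predicted vs. ground-truth acupoint coordinates, in pixels.
pred = np.array([[101.0, 250.0], [180.5, 312.0], [220.0, 400.0]])
gt   = np.array([[ 98.0, 255.0], [175.0, 310.0], [228.0, 395.0]])

epe = np.linalg.norm(pred - gt, axis=1).mean()   # mean per-point Euclidean error
print(f"EPE = {epe:.3f} pixels")
```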
arXiv:2502.11083 (https://arxiv.org/abs/2502.11083) [pdf, other]
Subjects: cs.CL (Computation and Language)
Streamlining the Collaborative Chain of Models into A Single Forward Pass in Generation-Based Tasks
Authors: Yuanjie Lyu, Chao Zhang, Yuhao Chen, Yong Chen, Tong Xu
Abstract: In Retrieval-Augmented Generation (RAG) and agent-based frameworks, the "Chain of Models" approach is widely used, where multiple specialized models work sequentially on distinct sub-tasks. This approach is effective but increases resource demands, as each model must be deployed separately. Recent advancements attempt to address this by applying prompt tuning, which allows a shared base model to adapt to multiple tasks with minimal parameter changes. However, a key challenge remains: intermediate outputs, passed between models as plain text, require recomputation of hidden states (i.e., the Key and Value (KV) states in Transformers) during inference.
In this paper, we introduce FTHSS, a novel prompt-tuning method that enables models to share KV hidden states, eliminating redundant forward passes and reducing KV cache storage. By modifying input and attention masks during training, FTHSS allows models to effectively utilize KV hidden states from prior models in both single- and multi-round scenarios. Empirical results on four tasks show that FTHSS matches the performance of traditional model chains while improving inference efficiency.
Submitted 16 February, 2025; originally announced February 2025.
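FTHSS's starting point is that re-encoding intermediate text wastes computation. The snippet below shows the generic mechanism involved, reusing a Hugging Face causal LM's past_key_values across two forward passes of the same model; sharing such states across different prompt-tuned models, as FTHSS does, additionally requires the paper's training procedure and is not shown here.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

with torch.no_grad():
    # First pass: encode a "sub-task" context once and keep its KV cache.
    ctx = tok("Intermediate output of model A:", return_tensors="pt")
    out = model(**ctx, use_cache=True)
    past = out.past_key_values

    # Second pass: continue from the cached states without re-encoding the context.
    nxt = tok(" now refine it.", return_tensors="pt")
    out2 = model(input_ids=nxt.input_ids, past_key_values=past, use_cache=True)
    print(out2.logits.shape)   # logits are produced only for the new tokens
```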
arXiv:2502.11075 (https://arxiv.org/abs/2502.11075) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Exposing Numeracy Gaps: A Benchmark to Evaluate Fundamental Numerical Abilities in Large Language Models
Authors: Haoyang Li, Xuejia Chen, Zhanchao XU, Darian Li, Nicole Hu, Fei Teng, Yiming Li, Luyu Qiu, Chen Jason Zhang, Qing Li, Lei Chen
Abstract: Large Language Models (LLMs) have demonstrated impressive capabilities in natural language processing tasks, such as text generation and semantic understanding. However, their performance on numerical reasoning tasks, such as basic arithmetic, numerical retrieval, and magnitude comparison, remains surprisingly poor. This gap arises from their reliance on surface-level statistical patterns rather than understanding numbers as continuous magnitudes. Existing benchmarks primarily focus on either linguistic competence or structured mathematical problem-solving, neglecting the fundamental numerical reasoning required in real-world scenarios. To bridge this gap, we propose NumericBench, a comprehensive benchmark to evaluate six fundamental numerical capabilities: number recognition, arithmetic operations, contextual retrieval, comparison, summary, and logical reasoning. NumericBench includes datasets ranging from synthetic number lists to crawled real-world data, addressing challenges like long contexts, noise, and multi-step reasoning. Extensive experiments on state-of-the-art LLMs, including GPT-4 and DeepSeek, reveal persistent weaknesses in numerical reasoning, highlighting the urgent need to improve numerically-aware language modeling. The benchmark is released at: https://github.com/TreeAI-Lab/NumericBench.
Submitted 16 February, 2025; originally announced February 2025.
arXiv:2502.11022 (https://arxiv.org/abs/2502.11022) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
MultiTEND: A Multilingual Benchmark for Natural Language to NoSQL Query Translation
Authors: Zhiqian Qin, Yuanfeng Song, Jinwei Lu, Yuanwei Song, Shuaimin Li, Chen Jason Zhang
Abstract: Natural language interfaces for NoSQL databases are increasingly vital in the big data era, enabling users to interact with complex, unstructured data without deep technical expertise. However, most recent advancements focus on English, leaving a gap for multilingual support. This paper introduces MultiTEND, the first and largest multilingual benchmark for natural language to NoSQL query generation, covering six languages: English, German, French, Russian, Japanese and Mandarin Chinese. Using MultiTEND, we analyze challenges in translating natural language to NoSQL queries across diverse linguistic structures, including lexical and syntactic differences. Experiments show that performance accuracy in both English and non-English settings remains relatively low, with a 4%-6% gap across scenarios like fine-tuned SLM, zero-shot LLM, and RAG for LLM. To address the aforementioned challenges, we introduce MultiLink, a novel framework that bridges the gap between multilingual input and NoSQL query generation through a Parallel Linking Process. It breaks down the task into multiple steps, integrating parallel multilingual processing, Chain-of-Thought (CoT) reasoning, and Retrieval-Augmented Generation (RAG) to tackle the lexical and structural challenges inherent in multilingual NoSQL generation.
MultiLink outperforms the top baseline on all metrics for every language, boosting execution accuracy by about 15% for English and by an average of 10% for the non-English languages.
Submitted 16 February, 2025; originally announced February 2025.

arXiv:2502.10959 (https://arxiv.org/abs/2502.10959) [pdf, other]
Subjects: cs.DB (Databases)
Revisiting the Design of In-Memory Dynamic Graph Storage
Authors: Jixian Su, Chiyu Hao, Shixuan Sun, Hao Zhang, Sen Gao, Jiaxin Jiang, Yao Chen, Chenyi Zhang, Bingsheng He, Minyi Guo
Abstract: The effectiveness of in-memory dynamic graph storage (DGS) for supporting concurrent graph read and write queries is crucial for real-time graph analytics and updates. Various methods have been proposed, for example, LLAMA, Aspen, LiveGraph, Teseo, and Sortledton. These approaches differ significantly in their support for read and write operations, space overhead, and concurrency control. However, there has been no systematic study to explore the trade-offs among these dimensions.
In this paper, we evaluate the effectiveness of individual techniques and identify the performance factors affecting these storage methods by proposing a common abstraction for DGS design and implementing a generic test framework based on this abstraction. Our findings highlight several key insights: 1) Existing DGS methods exhibit substantial space overhead. For example, Aspen consumes 3.3-10.8x more memory than CSR, while the optimal fine-grained methods consume 4.1-8.9x more memory than CSR, indicating a significant memory overhead. 2) Existing methods often overlook the impact of memory access on modern architectures, leading to performance degradation compared to continuous storage methods. 3) Fine-grained concurrency control methods, in particular, suffer from severe efficiency and space issues due to maintaining versions and performing checks for each neighbor. These methods also experience significant contention on high-degree vertices. Our systematic study reveals these performance bottlenecks and outlines future directions to improve DGS for real-time graph analytics.
Submitted 15 February, 2025; originally announced February 2025.
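For context on the CSR baseline that the memory overheads above are measured against, here is a minimal compressed-sparse-row build of a static graph from an edge list (the graph data is illustrative); dynamic graph stores pay extra space precisely to avoid rebuilding these two arrays on every update.

```python
import numpy as np

num_vertices = 5
edges = np.array([(0, 1), (0, 2), (1, 2), (2, 3), (3, 4), (4, 0)])  # (src, dst)

# CSR: `indptr[v]:indptr[v+1]` slices `indices` to give v's out-neighbors.
order = np.argsort(edges[:, 0], kind="stable")
sorted_edges = edges[order]
indices = sorted_edges[:, 1]
counts = np.bincount(sorted_edges[:, 0], minlength=num_vertices)
indptr = np.concatenate(([0], np.cumsum(counts)))

def neighbors(v: int) -> np.ndarray:
    return indices[indptr[v]:indptr[v + 1]]

print(neighbors(0))   # -> [1 2]
```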
Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Guimbreti%C3%A8re%2C+F">François Guimbretière</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Cheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10830v1-abstract-short" style="display: inline;"> Fingerspelling is a critical part of American Sign Language (ASL) recognition and has become an accessible optional text entry method for Deaf and Hard of Hearing (DHH) individuals. In this paper, we introduce SpellRing, a single smart ring worn on the thumb that recognizes words continuously fingerspelled in ASL. SpellRing uses active acoustic sensing (via a microphone and speaker) and an inertia&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10830v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10830v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10830v1-abstract-full" style="display: none;"> Fingerspelling is a critical part of American Sign Language (ASL) recognition and has become an accessible optional text entry method for Deaf and Hard of Hearing (DHH) individuals. In this paper, we introduce SpellRing, a single smart ring worn on the thumb that recognizes words continuously fingerspelled in ASL. SpellRing uses active acoustic sensing (via a microphone and speaker) and an inertial measurement unit (IMU) to track handshape and movement, which are processed through a deep learning algorithm using Connectionist Temporal Classification (CTC) loss. We evaluated the system with 20 ASL signers (13 fluent and 7 learners), using the MacKenzie-Soukoref Phrase Set of 1,164 words and 100 phrases. Offline evaluation yielded top-1 and top-5 word recognition accuracies of 82.45% (9.67%) and 92.42% (5.70%), respectively. In real-time, the system achieved a word error rate (WER) of 0.099 (0.039) on the phrases. Based on these results, we discuss key lessons and design implications for future minimally obtrusive ASL recognition wearables. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10830v1-abstract-full').style.display = 'none'; document.getElementById('2502.10830v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CHI Conference on Human Factors in Computing Systems (CHI 2025) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10713">arXiv:2502.10713</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10713">pdf</a>, <a href="https://arxiv.org/format/2502.10713">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving action segmentation via explicit similarity measurement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aouaidjia%2C+K">Kamel Aouaidjia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+A">Aofan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chongsheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10713v1-abstract-short" style="display: inline;"> Existing supervised action segmentation methods depend on the quality of frame-wise classification using attention mechanisms or temporal convolutions to capture temporal dependencies. Even boundary detection-based methods primarily depend on the accuracy of an initial frame-wise classification, which can overlook precise identification of segments and boundaries in case of low-quality prediction.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10713v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10713v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10713v1-abstract-full" style="display: none;"> Existing supervised action segmentation methods depend on the quality of frame-wise classification using attention mechanisms or temporal convolutions to capture temporal dependencies. Even boundary detection-based methods primarily depend on the accuracy of an initial frame-wise classification, which can overlook precise identification of segments and boundaries in case of low-quality prediction. To address this problem, this paper proposes ASESM (Action Segmentation via Explicit Similarity Measurement) to enhance the segmentation accuracy by incorporating explicit similarity evaluation across frames and predictions. Our supervised learning architecture uses frame-level multi-resolution features as input to multiple Transformer encoders. The resulting multiple frame-wise predictions are used for similarity voting to obtain high quality initial prediction. We apply a newly proposed boundary correction algorithm that operates based on feature similarity between consecutive frames to adjust the boundary locations iteratively through the learning process. The corrected prediction is then further refined through multiple stages of temporal convolutions. As post-processing, we optionally apply boundary correction again followed by a segment smoothing method that removes outlier classes within segments using similarity measurement between consecutive predictions. 
Additionally, we propose a fully unsupervised boundary detection-correction algorithm that identifies segment boundaries based solely on feature similarity without any training. Experiments on 50Salads, GTEA, and Breakfast datasets show the effectiveness of both the supervised and unsupervised algorithms. Code and models are made available on Github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10713v1-abstract-full').style.display = 'none'; document.getElementById('2502.10713v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10641">arXiv:2502.10641</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10641">pdf</a>, <a href="https://arxiv.org/format/2502.10641">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Toward Equitable Access: Leveraging Crowdsourced Reviews to Investigate Public Perceptions of Health Resource Accessibility </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Z">Zhaoqian Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Guanhong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+K">Kai Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Q">Qingcheng Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Songhua Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+W">Wenyue Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+L">Lizhou Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongfeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lingyao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10641v1-abstract-short" style="display: inline;"> Access to health resources is a critical determinant of public well-being and societal resilience, particularly during public health crises when demand for medical services and preventive care surges. However, disparities in accessibility persist across demographic and geographic groups, raising concerns about equity. 
Traditional survey methods often fall short due to limitations in coverage, cost&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10641v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10641v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10641v1-abstract-full" style="display: none;"> Access to health resources is a critical determinant of public well-being and societal resilience, particularly during public health crises when demand for medical services and preventive care surges. However, disparities in accessibility persist across demographic and geographic groups, raising concerns about equity. Traditional survey methods often fall short due to limitations in coverage, cost, and timeliness. This study leverages crowdsourced data from Google Maps reviews, applying advanced natural language processing techniques, specifically ModernBERT, to extract insights on public perceptions of health resource accessibility in the United States during the COVID-19 pandemic. Additionally, we employ Partial Least Squares regression to examine the relationship between accessibility perceptions and key socioeconomic and demographic factors including political affiliation, racial composition, and educational attainment. Our findings reveal that public perceptions of health resource accessibility varied significantly across the U.S., with disparities peaking during the pandemic and slightly easing post-crisis. Political affiliation, racial demographics, and education levels emerged as key factors shaping these perceptions. These findings underscore the need for targeted interventions and policy measures to address inequities, fostering a more inclusive healthcare infrastructure that can better withstand future public health challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10641v1-abstract-full').style.display = 'none'; document.getElementById('2502.10641v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10389">arXiv:2502.10389</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.10389">pdf</a>, <a href="https://arxiv.org/format/2502.10389">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Region-Adaptive Sampling for Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengruidong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yiqi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lili Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+Y">Yang You</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuqing Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10389v1-abstract-short" style="display: inline;"> Diffusion models (DMs) have become the leading choice for generative tasks across diverse domains. However, their reliance on multiple sequential forward passes significantly limits real-time performance. Previous acceleration methods have primarily focused on reducing the number of sampling steps or reusing intermediate results, failing to leverage variations across spatial regions within the ima&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10389v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10389v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10389v1-abstract-full" style="display: none;"> Diffusion models (DMs) have become the leading choice for generative tasks across diverse domains. However, their reliance on multiple sequential forward passes significantly limits real-time performance. Previous acceleration methods have primarily focused on reducing the number of sampling steps or reusing intermediate results, failing to leverage variations across spatial regions within the image due to the constraints of convolutional U-Net structures. By harnessing the flexibility of Diffusion Transformers (DiTs) in handling variable number of tokens, we introduce RAS, a novel, training-free sampling strategy that dynamically assigns different sampling ratios to regions within an image based on the focus of the DiT model. Our key observation is that during each sampling step, the model concentrates on semantically meaningful regions, and these areas of focus exhibit strong continuity across consecutive steps. Leveraging this insight, RAS updates only the regions currently in focus, while other regions are updated using cached noise from the previous step. The model&#39;s focus is determined based on the output from the preceding step, capitalizing on the temporal consistency we observed. 
We evaluate RAS on Stable Diffusion 3 and Lumina-Next-T2I, achieving speedups up to 2.36x and 2.51x, respectively, with minimal degradation in generation quality. Additionally, a user study reveals that RAS delivers comparable qualities under human evaluation while achieving a 1.6x speedup. Our approach makes a significant step towards more efficient diffusion transformers, enhancing their potential for real-time applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10389v1-abstract-full').style.display = 'none'; document.getElementById('2502.10389v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09749">arXiv:2502.09749</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09749">pdf</a>, <a href="https://arxiv.org/format/2502.09749">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Vote-Tree-Planner: Optimizing Execution Order in LLM-based Task Planning Pipeline via Voting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chaoyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaowei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Wentao Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09749v1-abstract-short" style="display: inline;"> Integrating large language models (LLMs) into closed-loop robotic task planning has become increasingly popular within embodied artificial intelligence. Previous efforts mainly focused on leveraging the strong reasoning abilities of LLMs to enhance task planning performance while often overlooking task planning efficiency and executability due to repetitive queries to LLMs. This paper addresses th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09749v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09749v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09749v1-abstract-full" style="display: none;"> Integrating large language models (LLMs) into closed-loop robotic task planning has become increasingly popular within embodied artificial intelligence. Previous efforts mainly focused on leveraging the strong reasoning abilities of LLMs to enhance task planning performance while often overlooking task planning efficiency and executability due to repetitive queries to LLMs. This paper addresses the synergy between LLMs and task planning systems, aiming to minimize redundancy while enhancing planning effectiveness. Specifically, building upon Prog-Prompt and the high-level concept of Tree-Planner, we propose Vote-Tree-Planner. 
This sampling strategy utilizes votes to guide plan traversal during the decision-making process. Our approach is motivated by a straightforward observation: assigning weights to agents during decision-making enables the evaluation of critical paths before execution. With this simple vote-tree construction, our method further improves the success rate and reduces the number of queries to LLMs. The experimental results highlight that our Vote-Tree-Planner demonstrates greater stability and shows a higher average success rate and goal condition recall on the unseen dataset compared with previous baseline methods. These findings underscore the potential of the Vote-Tree-Planner to enhance planning accuracy, reliability, and efficiency in LLM-based planning systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09749v1-abstract-full').style.display = 'none'; document.getElementById('2502.09749v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to RSS24-W: TaskSpec</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09635">arXiv:2502.09635</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09635">pdf</a>, <a href="https://arxiv.org/format/2502.09635">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CORRECT: Context- and Reference-Augmented Reasoning and Prompting for Fact-Checking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D+C">Delvin Ce Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+D">Dongwon Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09635v1-abstract-short" style="display: inline;"> Fact-checking the truthfulness of claims usually requires reasoning over multiple evidence sentences. Oftentimes, evidence sentences may not be always self-contained, and may require additional contexts and references from elsewhere to understand coreferential expressions, acronyms, and the scope of a reported finding. For example, evidence sentences from an academic paper may need contextual sent&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09635v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09635v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09635v1-abstract-full" style="display: none;"> Fact-checking the truthfulness of claims usually requires reasoning over multiple evidence sentences. 
Oftentimes, evidence sentences may not be always self-contained, and may require additional contexts and references from elsewhere to understand coreferential expressions, acronyms, and the scope of a reported finding. For example, evidence sentences from an academic paper may need contextual sentences in the paper and descriptions in its cited papers to determine the scope of a research discovery. However, most fact-checking models mainly focus on the reasoning within evidence sentences, and ignore the auxiliary contexts and references. To address this problem, we propose a novel method, Context- and Reference-augmented Reasoning and Prompting. For evidence reasoning, we construct a three-layer evidence graph with evidence, context, and reference layers. We design intra- and cross-layer reasoning to integrate three graph layers into a unified evidence embedding. For verdict prediction, we design evidence-conditioned prompt encoder, which produces unique prompt embeddings for each claim. These evidence-conditioned prompt embeddings and claims are unified for fact-checking. Experiments verify the strength of our model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09635v1-abstract-full').style.display = 'none'; document.getElementById('2502.09635v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NAACL-25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09100">arXiv:2502.09100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09100">pdf</a>, <a href="https://arxiv.org/format/2502.09100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Logical Reasoning in Large Language Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hanmeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Z">Zhizhang Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+M">Mengru Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+R">Ruoxi Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chaoli Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaozhang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09100v1-abstract-short" style="display: inline;"> With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities. However, their ability to perform rigorous logical reasoning remains an open question. 
This survey synthesizes recent advancements in logical reasoning within LLMs, a critical area of AI research. It outlines the scope of logical reasonin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09100v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09100v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09100v1-abstract-full" style="display: none;"> With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities. However, their ability to perform rigorous logical reasoning remains an open question. This survey synthesizes recent advancements in logical reasoning within LLMs, a critical area of AI research. It outlines the scope of logical reasoning in LLMs, its theoretical foundations, and the benchmarks used to evaluate reasoning proficiency. We analyze existing capabilities across different reasoning paradigms - deductive, inductive, abductive, and analogical - and assess strategies to enhance reasoning performance, including data-centric tuning, reinforcement learning, decoding strategies, and neuro-symbolic approaches. The review concludes with future directions, emphasizing the need for further exploration to strengthen logical reasoning in AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09100v1-abstract-full').style.display = 'none'; document.getElementById('2502.09100v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08987">arXiv:2502.08987</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08987">pdf</a>, <a href="https://arxiv.org/format/2502.08987">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Neural Force Field: Learning Generalized Physical Representation from a Few Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shiqian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+R">Ruihong Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yixin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08987v2-abstract-short" style="display: inline;"> Physical reasoning is a remarkable human ability that enables rapid learning and generalization from limited experience. Current AI models, despite extensive training, still struggle to achieve similar generalization, especially in Out-of-distribution (OOD) settings. This limitation stems from their inability to abstract core physical principles from observations. 
A key challenge is developing rep&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08987v2-abstract-full').style.display = 'inline'; document.getElementById('2502.08987v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08987v2-abstract-full" style="display: none;"> Physical reasoning is a remarkable human ability that enables rapid learning and generalization from limited experience. Current AI models, despite extensive training, still struggle to achieve similar generalization, especially in Out-of-distribution (OOD) settings. This limitation stems from their inability to abstract core physical principles from observations. A key challenge is developing representations that can efficiently learn and generalize physical dynamics from minimal data. Here we present Neural Force Field (NFF) a modeling framework built on Neural Ordinary Differential Equation (NODE) that learns interpretable force field representations which can be efficiently integrated through an Ordinary Differential Equation ( ODE) solver to predict object trajectories. Unlike existing approaches that rely on high-dimensional latent spaces, NFF captures fundamental physical concepts such as gravity, support, and collision in an interpretable manner. Experiments on two challenging physical reasoning tasks demonstrate that NFF, trained with only a few examples, achieves strong generalization to unseen scenarios. This physics-grounded representation enables efficient forward-backward planning and rapid adaptation through interactive refinement. Our work suggests that incorporating physics-inspired representations into learning systems can help bridge the gap between artificial and human physical reasoning capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08987v2-abstract-full').style.display = 'none'; document.getElementById('2502.08987v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08958">arXiv:2502.08958</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08958">pdf</a>, <a href="https://arxiv.org/format/2502.08958">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Biologically Plausible Brain Graph Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Ciyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yuelong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Q">Qichao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shuo Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+F">Feng Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengqi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yaochu Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08958v1-abstract-short" style="display: inline;"> State-of-the-art brain graph analysis methods fail to fully encode the small-world architecture of brain graphs (accompanied by the presence of hubs and functional modules), and therefore lack biological plausibility to some extent. This limitation hinders their ability to accurately represent the brain&#39;s structural and functional properties, thereby restricting the effectiveness of machine learni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08958v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08958v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08958v1-abstract-full" style="display: none;"> State-of-the-art brain graph analysis methods fail to fully encode the small-world architecture of brain graphs (accompanied by the presence of hubs and functional modules), and therefore lack biological plausibility to some extent. This limitation hinders their ability to accurately represent the brain&#39;s structural and functional properties, thereby restricting the effectiveness of machine learning models in tasks such as brain disorder detection. In this work, we propose a novel Biologically Plausible Brain Graph Transformer (BioBGT) that encodes the small-world architecture inherent in brain graphs. Specifically, we present a network entanglement-based node importance encoding technique that captures the structural importance of nodes in global information propagation during brain graph communication, highlighting the biological properties of the brain structure. Furthermore, we introduce a functional module-aware self-attention to preserve the functional segregation and integration characteristics of brain graphs in the learned representations. 
Experimental results on three benchmark datasets demonstrate that BioBGT outperforms state-of-the-art models, enhancing biologically plausible brain graph representations for various brain graph analytical tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08958v1-abstract-full').style.display = 'none'; document.getElementById('2502.08958v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 16 figures, published as a conference paper at ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08902">arXiv:2502.08902</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08902">pdf</a>, <a href="https://arxiv.org/format/2502.08902">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoL3D: Collaborative Learning of Single-view Depth and Camera Intrinsics for Metric 3D Shape Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenghao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+L">Lubin Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shen Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Bojian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08902v1-abstract-short" style="display: inline;"> Recovering the metric 3D shape from a single image is particularly relevant for robotics and embodied intelligence applications, where accurate spatial understanding is crucial for navigation and interaction with environments. Usually, the mainstream approaches achieve it through monocular depth estimation. However, without camera intrinsics, the 3D metric shape can not be recovered from depth alo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08902v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08902v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08902v1-abstract-full" style="display: none;"> Recovering the metric 3D shape from a single image is particularly relevant for robotics and embodied intelligence applications, where accurate spatial understanding is crucial for navigation and interaction with environments. Usually, the mainstream approaches achieve it through monocular depth estimation. However, without camera intrinsics, the 3D metric shape can not be recovered from depth alone. In this study, we theoretically demonstrate that depth serves as a 3D prior constraint for estimating camera intrinsics and uncover the reciprocal relations between these two elements.
Motivated by this, we propose a collaborative learning framework for jointly estimating depth and camera intrinsics, named CoL3D, to learn metric 3D shapes from single images. Specifically, CoL3D adopts a unified network and performs collaborative optimization at three levels: depth, camera intrinsics, and 3D point clouds. For camera intrinsics, we design a canonical incidence field mechanism as a prior that enables the model to learn the residual incident field for enhanced calibration. Additionally, we incorporate a shape similarity measurement loss in the point cloud space, which improves the quality of 3D shapes essential for robotic applications. As a result, when training and testing on a single dataset with in-domain settings, CoL3D delivers outstanding performance in both depth estimation and camera calibration across several indoor and outdoor benchmark datasets, which leads to remarkable 3D shape quality for the perception capabilities of robots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08902v1-abstract-full').style.display = 'none'; document.getElementById('2502.08902v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08428">arXiv:2502.08428</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08428">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Robot-Initiated Social Control of Sedentary Behavior: Comparing the Impact of Relationship- and Target-Focused Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jiaxin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=van+der+Horst%2C+S+A+M">Sterre Anna Mariam van der Horst</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cuijpers%2C+R+H">Raymond H. Cuijpers</a>, <a href="/search/cs?searchtype=author&amp;query=IJsselsteijn%2C+W+A">Wijnand A. IJsselsteijn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08428v1-abstract-short" style="display: inline;"> To design social robots to effectively promote health behavior change, it is essential to understand how people respond to various health communication strategies employed by these robots. 
This study examines the effectiveness of two types of social control strategies from a social robot, relationship-focused strategies (emphasizing relational consequences) and target-focused strategies (emphasizi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08428v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08428v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08428v1-abstract-full" style="display: none;"> To design social robots to effectively promote health behavior change, it is essential to understand how people respond to various health communication strategies employed by these robots. This study examines the effectiveness of two types of social control strategies from a social robot, relationship-focused strategies (emphasizing relational consequences) and target-focused strategies (emphasizing health consequences), in encouraging people to reduce sedentary behavior. A two-session lab experiment was conducted (n = 135), where participants first played a game with a robot, followed by the robot persuading them to stand up and move using one of the strategies. Half of the participants joined a second session to have a repeated interaction with the robot. Results showed that relationship-focused strategies motivated participants to stay active longer. Repeated sessions did not strengthen participants&#39; relationship with the robot, but those who felt more attached to the robot responded more actively to the target-focused strategies. These findings offer valuable insights for designing persuasive strategies for social robots in health communication contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08428v1-abstract-full').style.display = 'none'; document.getElementById('2502.08428v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to HRI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08281">arXiv:2502.08281</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08281">pdf</a>, <a href="https://arxiv.org/ps/2502.08281">ps</a>, <a href="https://arxiv.org/format/2502.08281">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Redefining Simplicity: Benchmarking Large Language Models from Lexical to Document Simplification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiang%2C+J">Jipeng Qiang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+M">Minjiang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yunhao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chaowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kui Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08281v1-abstract-short" style="display: inline;"> Text simplification (TS) refers to the process of reducing the complexity of a text while retaining its original meaning and key information. Existing work only shows that large language models (LLMs) have outperformed supervised non-LLM-based methods on sentence simplification. This study offers the first comprehensive analysis of LLM performance across four TS tasks: lexical, syntactic, sentence&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08281v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08281v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08281v1-abstract-full" style="display: none;"> Text simplification (TS) refers to the process of reducing the complexity of a text while retaining its original meaning and key information. Existing work only shows that large language models (LLMs) have outperformed supervised non-LLM-based methods on sentence simplification. This study offers the first comprehensive analysis of LLM performance across four TS tasks: lexical, syntactic, sentence, and document simplification. We compare lightweight, closed-source and open-source LLMs against traditional non-LLM methods using automatic metrics and human evaluations. Our experiments reveal that LLMs not only outperform non-LLM approaches in all four tasks but also often generate outputs that exceed the quality of existing human-annotated references. Finally, we present some future directions of TS in the era of LLMs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08281v1-abstract-full').style.display = 'none'; document.getElementById('2502.08281v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07968">arXiv:2502.07968</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07968">pdf</a>, <a href="https://arxiv.org/format/2502.07968">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generative Risk Minimization for Out-of-Distribution Generalization on Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yaochen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chuxu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jundong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07968v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) generalization on graphs aims at dealing with scenarios where the test graph distribution differs from the training graph distributions. Compared to i.i.d. data like images, the OOD generalization problem on graph-structured data remains challenging due to the non-i.i.d. property and complex structural information on graphs. Recently, several works on graph OOD generaliza&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07968v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07968v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07968v1-abstract-full" style="display: none;"> Out-of-distribution (OOD) generalization on graphs aims at dealing with scenarios where the test graph distribution differs from the training graph distributions. Compared to i.i.d. data like images, the OOD generalization problem on graph-structured data remains challenging due to the non-i.i.d. property and complex structural information on graphs. Recently, several works on graph OOD generalization have explored extracting invariant subgraphs that share crucial classification information across different distributions. Nevertheless, such a strategy could be suboptimal for entirely capturing the invariant information, as the extraction of discrete structures could potentially lead to the loss of invariant information or the involvement of spurious information. In this paper, we propose an innovative framework, named Generative Risk Minimization (GRM), designed to generate an invariant subgraph for each input graph to be classified, instead of extraction. 
To address the challenge of optimization in the absence of optimal invariant subgraphs (i.e., ground truths), we derive a tractable form of the proposed GRM objective by introducing a latent causal variable, and its effectiveness is validated by our theoretical analysis. We further conduct extensive experiments across a variety of real-world graph datasets for both node-level and graph-level OOD generalization, and the results demonstrate the superiority of our framework GRM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07968v1-abstract-full').style.display = 'none'; document.getElementById('2502.07968v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TMLR 02/2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07963">arXiv:2502.07963</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07963">pdf</a>, <a href="https://arxiv.org/format/2502.07963">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Caught in the Web of Words: Do LLMs Fall for Spin in Medical Literature? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yun%2C+H+S">Hye Sun Yun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K+Y+C">Karen Y. C. Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kouzy%2C+R">Ramez Kouzy</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+I+J">Iain J. Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J+J">Junyi Jessy Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wallace%2C+B+C">Byron C. Wallace</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07963v1-abstract-short" style="display: inline;"> Medical research faces well-documented challenges in translating novel treatments into clinical practice. Publishing incentives encourage researchers to present &#34;positive&#34; findings, even when empirical results are equivocal. Consequently, it is well-documented that authors often spin study results, especially in article abstracts. Such spin can influence clinician interpretation of evidence and ma&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07963v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07963v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07963v1-abstract-full" style="display: none;"> Medical research faces well-documented challenges in translating novel treatments into clinical practice. 
Publishing incentives encourage researchers to present &#34;positive&#34; findings, even when empirical results are equivocal. Consequently, it is well-documented that authors often spin study results, especially in article abstracts. Such spin can influence clinician interpretation of evidence and may affect patient care decisions. In this study, we ask whether the interpretation of trial results offered by Large Language Models (LLMs) is similarly affected by spin. This is important since LLMs are increasingly being used to trawl through and synthesize published medical evidence. We evaluated 22 LLMs and found that they are across the board more susceptible to spin than humans. They might also propagate spin into their outputs: We find evidence, e.g., that LLMs implicitly incorporate spin into plain language summaries that they generate. We also find, however, that LLMs are generally capable of recognizing spin, and can be prompted in a way to mitigate spin&#39;s impact on LLM outputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07963v1-abstract-full').style.display = 'none'; document.getElementById('2502.07963v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 10 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07901">arXiv:2502.07901</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07901">pdf</a>, <a href="https://arxiv.org/format/2502.07901">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> StarCast: A Secure and Spectrum-Efficient Group Communication Scheme for LEO Satellite Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chaoyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Hexuan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+S">Shanghao Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yi Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Burger%2C+E">Eric Burger</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Y+T">Y. Thomas Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Lou%2C+W">Wenjing Lou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07901v1-abstract-short" style="display: inline;"> Low Earth Orbit (LEO) satellite networks serve as a cornerstone infrastructure for providing ubiquitous connectivity in areas where terrestrial infrastructure is unavailable. 
With the emergence of Direct-to-Cell (DTC) satellites, these networks can provide direct access to mobile phones and IoT devices without relying on terrestrial base stations, leading to a surge in massive connectivity demands&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07901v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07901v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07901v1-abstract-full" style="display: none;"> Low Earth Orbit (LEO) satellite networks serve as a cornerstone infrastructure for providing ubiquitous connectivity in areas where terrestrial infrastructure is unavailable. With the emergence of Direct-to-Cell (DTC) satellites, these networks can provide direct access to mobile phones and IoT devices without relying on terrestrial base stations, leading to a surge in massive connectivity demands for the serving satellite. To address this issue, group communication is an effective paradigm that enables simultaneous content delivery to multiple users and thus optimizes bandwidth reuse. Although extensive research has been conducted to improve group communication performance, securing this communication without compromising its inherent spectrum efficiency remains a critical challenge. To address this, we introduce StarCast, a secure group encryption scheme for LEO satellite networks. Our solution leverages ciphertext-policy attribute-based encryption (CP-ABE) to implement fine-grained access control by embedding access policies directly within the ciphertext. Unlike standard secure communication approaches that require dedicated per-user channels and significantly deplete limited satellite spectrum resources, StarCast maintains efficient spectrum reuse within user groups while ensuring that only authorized users can access transmitted data. Additionally, it significantly reduces the costly key management overhead associated with conventional encryption schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07901v1-abstract-full').style.display = 'none'; document.getElementById('2502.07901v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07837">arXiv:2502.07837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07837">pdf</a>, <a href="https://arxiv.org/format/2502.07837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RoboBERT: An End-to-end Multimodal Robotic Manipulation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Sicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+J">Jianhua Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Haozhang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+H">Hailiang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yipeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+K">Kang Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengkun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+K">Kairos Wong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jie Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+B">Bin Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07837v1-abstract-short" style="display: inline;"> Embodied intelligence integrates multiple modalities, enabling agents to understand images, language, and actions simultaneously. However, existing models always depend on additional datasets or extensive pre-training to maximize performance improvements, consuming abundant training time and expensive hardware cost. To tackle this issue, we present RoboBERT, a novel end-to-end robotic manipulation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07837v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07837v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07837v1-abstract-full" style="display: none;"> Embodied intelligence integrates multiple modalities, enabling agents to understand images, language, and actions simultaneously. However, existing models always depend on additional datasets or extensive pre-training to maximize performance improvements, consuming abundant training time and expensive hardware cost. To tackle this issue, we present RoboBERT, a novel end-to-end robotic manipulation model integrated with a unique training strategy. This model utilizes a CNN-based diffusion policy, enhancing and stabilizing the effectiveness of this model by separating training processes for different modalities. It also underscores the importance of data augmentation, verifying various techniques to significantly boost performance. 
Unlike models that depend on extra data or large foundation models, RoboBERT achieves a highly competitive success rate while using only language-labeled expert demonstrations and maintaining a relatively smaller model size. Specifically, RoboBERT achieves an average length of 4.52 on the CALVIN benchmark for \(ABCD \rightarrow D\) task, setting a new state-of-the-art (SOTA) record. Furthermore, when tested on a real robot, the model demonstrates superior performance, achieving a higher success rate than other methods trained with the same data. We propose that these concepts and methodologies of RoboBERT demonstrate extensive versatility and compatibility, contributing significantly to the development of lightweight multimodal robotic models. The code can be accessed on https://github.com/PeterWangsicheng/RoboBERT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07837v1-abstract-full').style.display = 'none'; document.getElementById('2502.07837v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07001">arXiv:2502.07001</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07001">pdf</a>, <a href="https://arxiv.org/format/2502.07001">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> From Image to Video: An Empirical Study of Diffusion Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=V%C3%A9lez%2C+P">Pedro Vélez</a>, <a href="/search/cs?searchtype=author&amp;query=Polan%C3%ADa%2C+L+F">Luisa F. Polanía</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chuhan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kabra%2C+R">Rishab Kabra</a>, <a href="/search/cs?searchtype=author&amp;query=Arnab%2C+A">Anurag Arnab</a>, <a href="/search/cs?searchtype=author&amp;query=Sajjadi%2C+M+S+M">Mehdi S. M. Sajjadi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07001v1-abstract-short" style="display: inline;"> Diffusion models have revolutionized generative modeling, enabling unprecedented realism in image and video synthesis. This success has sparked interest in leveraging their representations for visual understanding tasks. While recent works have explored this potential for image generation, the visual understanding capabilities of video diffusion models remain largely uncharted.
To address this gap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07001v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07001v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07001v1-abstract-full" style="display: none;"> Diffusion models have revolutionized generative modeling, enabling unprecedented realism in image and video synthesis. This success has sparked interest in leveraging their representations for visual understanding tasks. While recent works have explored this potential for image generation, the visual understanding capabilities of video diffusion models remain largely uncharted. To address this gap, we systematically compare the same model architecture trained for video versus image generation, analyzing the performance of their latent representations on various downstream tasks including image classification, action recognition, depth estimation, and tracking. Results show that video diffusion models consistently outperform their image counterparts, though we find a striking range in the extent of this superiority. We further analyze features extracted from different layers and with varying noise levels, as well as the effect of model size and training budget on representation and generation quality. This work marks the first direct comparison of video and image diffusion objectives for visual understanding, offering insights into the role of temporal information in representation learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07001v1-abstract-full').style.display = 'none'; document.getElementById('2502.07001v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06710">arXiv:2502.06710</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06710">pdf</a>, <a href="https://arxiv.org/format/2502.06710">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning Musical Representations for Music Performance Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Diao%2C+X">Xingjian Diao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chunhui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tingxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+M">Ming Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+Z">Zhongyu Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Weiyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+J">Jiang Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06710v1-abstract-short" style="display: inline;"> Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities in general scenarios, they are incapable of dealing with fundamental problems within the music performances:&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06710v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06710v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06710v1-abstract-full" style="display: none;"> Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities in general scenarios, they are incapable of dealing with fundamental problems within the music performances: they underexplore the interaction between the multimodal signals in performance and fail to consider the distinctive characteristics of instruments and music. Therefore, existing methods tend to answer questions regarding musical performances inaccurately. 
To bridge the above research gaps, (i) given the intricate multimodal interconnectivity inherent to music data, our primary backbone is designed to incorporate multimodal interactions within the context of music; (ii) to enable the model to learn music characteristics, we annotate and release rhythmic and music sources in the current music datasets; (iii) for time-aware audio-visual modeling, we align the model&#39;s music predictions with the temporal dimension. Our experiments show state-of-the-art effects on the Music AVQA datasets. Our code is available at https://github.com/xid32/Amuse. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06710v1-abstract-full').style.display = 'none'; document.getElementById('2502.06710v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06589">arXiv:2502.06589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06589">pdf</a>, <a href="https://arxiv.org/format/2502.06589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Hephaestus: Improving Fundamental Agent Capabilities of Large Language Models through Continual Pre-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+Y">Yuchen Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jingfeng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Haoming Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+K">Kewei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lokegaonkar%2C+S">Sanket Lokegaonkar</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yifan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Ping%2C+Q">Qing Ping</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+B">Binxuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhengyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+P">Pei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruijie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rongzhi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zalmout%2C+N">Nasser Zalmout</a>, <a href="/search/cs?searchtype=author&amp;query=Nigam%2C+P">Priyanka Nigam</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+B">Bing Yin</a>, <a 
href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06589v1-abstract-short" style="display: inline;"> Due to the scarcity of agent-oriented pre-training data, LLM-based autonomous agents typically rely on complex prompting or extensive fine-tuning, which often fails to introduce new capabilities while preserving strong generalizability. We introduce Hephaestus-Forge, the first large-scale pre-training corpus designed to enhance the fundamental capabilities of LLM agents in API function calling, in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06589v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06589v1-abstract-full" style="display: none;"> Due to the scarcity of agent-oriented pre-training data, LLM-based autonomous agents typically rely on complex prompting or extensive fine-tuning, which often fails to introduce new capabilities while preserving strong generalizability. We introduce Hephaestus-Forge, the first large-scale pre-training corpus designed to enhance the fundamental capabilities of LLM agents in API function calling, intrinsic reasoning and planning, and adapting to environmental feedback. Hephaestus-Forge comprises 103B agent-specific data encompassing 76,537 APIs, including both tool documentation to introduce knowledge of API functions and function calling trajectories to strengthen intrinsic reasoning. To explore effective training protocols, we investigate scaling laws to identify the optimal recipe in data mixing ratios. By continual pre-training on Hephaestus-Forge, Hephaestus outperforms small- to medium-scale open-source LLMs and rivals commercial LLMs on three agent benchmarks, demonstrating the effectiveness of our pre-training corpus in enhancing fundamental agentic capabilities and generalization of LLMs to new tasks or environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06589v1-abstract-full').style.display = 'none'; document.getElementById('2502.06589v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NAACL 2025 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06586">arXiv:2502.06586</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06586">pdf</a>, <a href="https://arxiv.org/ps/2502.06586">ps</a>, <a href="https://arxiv.org/format/2502.06586">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> Decay of correlation for edge colorings when $q&gt;3螖$ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zejia Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yulin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chihao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zihan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06586v1-abstract-short" style="display: inline;"> We examine various perspectives on the decay of correlation for the uniform distribution over proper $q$-edge colorings of graphs with maximum degree $螖$. First, we establish the coupling independence property when $q\ge 3螖$ for general graphs. Together with the work of Chen et al. (2024), this result implies a fully polynomial-time approximation scheme (FPTAS) for counting the number of proper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06586v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06586v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06586v1-abstract-full" style="display: none;"> We examine various perspectives on the decay of correlation for the uniform distribution over proper $q$-edge colorings of graphs with maximum degree $螖$. First, we establish the coupling independence property when $q\ge 3螖$ for general graphs. Together with the work of Chen et al. (2024), this result implies a fully polynomial-time approximation scheme (FPTAS) for counting the number of proper $q$-edge colorings. Next, we prove the strong spatial mixing property on trees, provided that $q&gt; (3+o(1))螖$. The strong spatial mixing property is derived from the spectral independence property of a version of the weighted edge coloring distribution, which is established using the matrix trickle-down method developed in Abdolazimi, Liu and Oveis Gharan (FOCS, 2021) and Wang, Zhang and Zhang (STOC, 2024). Finally, we show that the weak spatial mixing property holds on trees with maximum degree $螖$ if and only if $q\ge 2螖-1$. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06586v1-abstract-full').style.display = 'none'; document.getElementById('2502.06586v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06453">arXiv:2502.06453</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06453">pdf</a>, <a href="https://arxiv.org/format/2502.06453">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MATH-Perturb: Benchmarking LLMs&#39; Math Reasoning Abilities against Hard Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+K">Kaixuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jiacheng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zihao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+X">Xiang Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+J">Jiawei Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenzhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yingqing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+T">Tianle Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+H">Hui Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Runzhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yue Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Shange Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinyun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mengdi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06453v2-abstract-short" style="display: inline;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. 
To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06453v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06453v2-abstract-full" style="display: none;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underlying reasoning patterns of the solutions. However, no work has explored hard perturbations, which fundamentally change the nature of the problem so that the original solution steps do not apply. To bridge the gap, we construct MATH-P-Simple and MATH-P-Hard via simple perturbation and hard perturbation, respectively. Each consists of 279 perturbed math problems derived from level-5 (hardest) problems in the MATH dataset (Hendrycks et al., 2021). We observe significant performance drops on MATH-P-Hard across various models, including o1-mini (-16.49%) and gemini-2.0-flash-thinking (-12.9%). We also raise concerns about a novel form of memorization where models blindly apply learned problem-solving skills without assessing their applicability to modified contexts. This issue is amplified when using original problems for in-context learning. We call for research efforts to address this challenge, which is critical for developing more robust and reliable reasoning models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'none'; document.getElementById('2502.06453v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v2: fix bugs in Fig.
1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06452">arXiv:2502.06452</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06452">pdf</a>, <a href="https://arxiv.org/format/2502.06452">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> SparseFocus: Learning-based One-shot Autofocus for Microscopy with Sparse Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+Y">Yongping Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xiaoxi Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Q">Qiang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jia Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yake Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yunfeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chaofan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenxin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Dongdong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shen Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06452v1-abstract-short" style="display: inline;"> Autofocus is necessary for high-throughput and real-time scanning in microscopic imaging. Traditional methods rely on complex hardware or iterative hill-climbing algorithms. Recent learning-based approaches have demonstrated remarkable efficacy in a one-shot setting, avoiding hardware modifications or iterative mechanical lens adjustments. However, in this paper, we highlight a significant challen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06452v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06452v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06452v1-abstract-full" style="display: none;"> Autofocus is necessary for high-throughput and real-time scanning in microscopic imaging. Traditional methods rely on complex hardware or iterative hill-climbing algorithms. Recent learning-based approaches have demonstrated remarkable efficacy in a one-shot setting, avoiding hardware modifications or iterative mechanical lens adjustments. However, in this paper, we highlight a significant challenge that the richness of image content can significantly affect autofocus performance. When the image content is sparse, previous autofocus methods, whether traditional climbing-hill or learning-based, tend to fail. To tackle this, we propose a content-importance-based solution, named SparseFocus, featuring a novel two-stage pipeline. The first stage measures the importance of regions within the image, while the second stage calculates the defocus distance from selected important regions. 
To validate our approach and benefit the research community, we collect a large-scale dataset comprising millions of labelled defocused images, encompassing both dense, sparse and extremely sparse scenarios. Experimental results show that SparseFocus surpasses existing methods, effectively handling all levels of content sparsity. Moreover, we integrate SparseFocus into our Whole Slide Imaging (WSI) system that performs well in real-world applications. The code and dataset will be made available upon the publication of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06452v1-abstract-full').style.display = 'none'; document.getElementById('2502.06452v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06200">arXiv:2502.06200</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06200">pdf</a>, <a href="https://arxiv.org/format/2502.06200">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On the query complexity of sampling from non-log-concave distributions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yuchen He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chihao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06200v2-abstract-short" style="display: inline;"> We study the problem of sampling from a $d$-dimensional distribution with density $p(x)\propto e^{-f(x)}$, which does not necessarily satisfy good isoperimetric conditions. Specifically, we show that for any $L,M$ satisfying $LM\ge d\ge 5$, $ε\in \left(0,\frac{1}{32}\right)$, and any algorithm with query accesses to the value of $f(x)$ and $\nabla f(x)$, there exists an $L$-log-smooth distributi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06200v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06200v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06200v2-abstract-full" style="display: none;"> We study the problem of sampling from a $d$-dimensional distribution with density $p(x)\propto e^{-f(x)}$, which does not necessarily satisfy good isoperimetric conditions.
Specifically, we show that for any $L,M$ satisfying $LM\ge d\ge 5$, $ε\in \left(0,\frac{1}{32}\right)$, and any algorithm with query accesses to the value of $f(x)$ and $\nabla f(x)$, there exists an $L$-log-smooth distribution with second moment at most $M$ such that the algorithm requires $\left(\frac{LM}{dε}\right)^{Ω(d)}$ queries to compute a sample whose distribution is within $ε$ in total variation distance to the target distribution. We complement the lower bound with an algorithm requiring $\left(\frac{LM}{dε}\right)^{\mathcal O(d)}$ queries, thereby characterizing the tight (up to the constant in the exponent) query complexity for sampling from the family of non-log-concave distributions. Our results are in sharp contrast with the recent work of Huang et al. (COLT&#39;24), where an algorithm with quasi-polynomial query complexity was proposed for sampling from a non-log-concave distribution when $M=\mathtt{poly}(d)$. Their algorithm works under the stronger condition that all distributions along the trajectory of the Ornstein-Uhlenbeck process, starting from the target distribution, are $\mathcal O(1)$-log-smooth. We investigate this condition and prove that it is strictly stronger than requiring the target distribution to be $\mathcal O(1)$-log-smooth. Additionally, we study this condition in the context of mixtures of Gaussians. Finally, we place our results within the broader theme of ``sampling versus optimization&#39;&#39;, as studied in Ma et al. (PNAS&#39;19). We show that for a wide range of parameters, sampling is strictly easier than optimization by a super-exponential factor in the dimension $d$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06200v2-abstract-full').style.display = 'none'; document.getElementById('2502.06200v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06195">arXiv:2502.06195</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06195">pdf</a>, <a href="https://arxiv.org/format/2502.06195">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Calibration of Multiple Asynchronous Microphone Arrays using Hybrid TDOA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+W">Wenda Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xinyang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+H">He Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06195v1-abstract-short" style="display: inline;"> Accurate calibration of acoustic sensing systems made of multiple asynchronous microphone arrays is essential for satisfactory performance in sound source localization and tracking. State-of-the-art calibration methods for this type of system rely on the time difference of arrival and direction of arrival measurements among the microphone arrays (denoted as TDOA-M and DOA, respectively). In this p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06195v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06195v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06195v1-abstract-full" style="display: none;"> Accurate calibration of acoustic sensing systems made of multiple asynchronous microphone arrays is essential for satisfactory performance in sound source localization and tracking. State-of-the-art calibration methods for this type of system rely on the time difference of arrival and direction of arrival measurements among the microphone arrays (denoted as TDOA-M and DOA, respectively). In this paper, to enhance calibration accuracy, we propose to incorporate the time difference of arrival measurements between adjacent sound events (TDOA-S) with respect to the microphone arrays. More specifically, we propose a two-stage calibration approach, including an initial value estimation (IVE) procedure and the final joint optimization step. The IVE stage first initializes all parameters except for microphone array orientations, using hybrid TDOA (i.e., TDOA-M and TDOA-S), odometer data from a moving robot carrying a speaker, and DOA. Subsequently, microphone orientations are estimated through the iterative closest point method. The final joint optimization step estimates multiple microphone array locations, orientations, time offsets, clock drift rates, and sound source locations simultaneously. Both simulation and experiment results show that for scenarios with low or moderate TDOA noise levels, our approach outperforms existing methods in terms of accuracy. All code and data are available at https://github.com/AISLABsustech/Hybrid-TDOA-Multi-Calib.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06195v1-abstract-full').style.display = 'none'; document.getElementById('2502.06195v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted and is going to be presented at ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06130">arXiv:2502.06130</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06130">pdf</a>, <a href="https://arxiv.org/format/2502.06130">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-Correcting Decoding with Generative Feedback for Mitigating Hallucinations in Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Ce Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zifu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Kan%2C+Z">Zhehan Kan</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+M+Q">Martin Q. Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Stepputtis%2C+S">Simon Stepputtis</a>, <a href="/search/cs?searchtype=author&amp;query=Ramanan%2C+D">Deva Ramanan</a>, <a href="/search/cs?searchtype=author&amp;query=Salakhutdinov%2C+R">Russ Salakhutdinov</a>, <a href="/search/cs?searchtype=author&amp;query=Morency%2C+L">Louis-Philippe Morency</a>, <a href="/search/cs?searchtype=author&amp;query=Sycara%2C+K">Katia Sycara</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yaqi Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06130v1-abstract-short" style="display: inline;"> While recent Large Vision-Language Models (LVLMs) have shown remarkable performance in multi-modal tasks, they are prone to generating hallucinatory text responses that do not align with the given visual input, which restricts their practical applicability in real-world scenarios. 
In this work, inspired by the observation that the text-to-image generation process is the inverse of image-conditione&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06130v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06130v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06130v1-abstract-full" style="display: none;"> While recent Large Vision-Language Models (LVLMs) have shown remarkable performance in multi-modal tasks, they are prone to generating hallucinatory text responses that do not align with the given visual input, which restricts their practical applicability in real-world scenarios. In this work, inspired by the observation that the text-to-image generation process is the inverse of image-conditioned response generation in LVLMs, we explore the potential of leveraging text-to-image generative models to assist in mitigating hallucinations in LVLMs. We discover that generative models can offer valuable self-feedback for mitigating hallucinations at both the response and token levels. Building on this insight, we introduce self-correcting Decoding with Generative Feedback (DeGF), a novel training-free algorithm that incorporates feedback from text-to-image generative models into the decoding process to effectively mitigate hallucinations in LVLMs. Specifically, DeGF generates an image from the initial response produced by LVLMs, which acts as an auxiliary visual reference and provides self-feedback to verify and correct the initial response through complementary or contrastive decoding. Extensive experimental results validate the effectiveness of our approach in mitigating diverse types of hallucinations, consistently surpassing state-of-the-art methods across six benchmarks. Code is available at https://github.com/zhangce01/DeGF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06130v1-abstract-full').style.display = 'none'; document.getElementById('2502.06130v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025. 
Project page:https://zhangce01.github.io/DeGF/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06020">arXiv:2502.06020</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06020">pdf</a>, <a href="https://arxiv.org/format/2502.06020">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Temporal Working Memory: Query-Guided Segment Refinement for Enhanced Multimodal Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Diao%2C+X">Xingjian Diao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chunhui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Weiyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+Z">Zhongyu Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Qing%2C+P">Peijun Qing</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+M">Ming Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Vosoughi%2C+S">Soroush Vosoughi</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+J">Jiang Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06020v1-abstract-short" style="display: inline;"> Multimodal foundation models (MFMs) have demonstrated significant success in tasks such as visual captioning, question answering, and image-text retrieval. However, these models face inherent limitations due to their finite internal capacity, which restricts their ability to process extended temporal sequences, a crucial requirement for comprehensive video and audio analysis. To overcome these cha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06020v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06020v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06020v1-abstract-full" style="display: none;"> Multimodal foundation models (MFMs) have demonstrated significant success in tasks such as visual captioning, question answering, and image-text retrieval. However, these models face inherent limitations due to their finite internal capacity, which restricts their ability to process extended temporal sequences, a crucial requirement for comprehensive video and audio analysis. To overcome these challenges, we introduce a specialized cognitive module, temporal working memory (TWM), which aims to enhance the temporal modeling capabilities of MFMs. It selectively retains task-relevant information across temporal dimensions, ensuring that critical details are preserved throughout the processing of video and audio content. The TWM uses a query-guided attention approach to focus on the most informative multimodal segments within temporal sequences. 
By retaining only the most relevant content, TWM optimizes the use of the model&#39;s limited capacity, enhancing its temporal modeling ability. This plug-and-play module can be easily integrated into existing MFMs. With our TWM, nine state-of-the-art models exhibit significant performance improvements across tasks such as video captioning, question answering, and video-text retrieval. By enhancing temporal modeling, TWM extends the capability of MFMs to handle complex, time-sensitive data effectively. Our code is available at https://github.com/xid32/NAACL_2025_TWM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06020v1-abstract-full').style.display = 'none'; document.getElementById('2502.06020v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NAACL 2025</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 
85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 
47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
