CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 791 results for author: <span class="mathjax">Yu, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yu%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yu, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yu%2C+W&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yu, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yu%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23888">arXiv:2503.23888</a> <span> [<a href="https://arxiv.org/pdf/2503.23888">pdf</a>, <a href="https://arxiv.org/format/2503.23888">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MuseFace: Text-driven Face Editing via Diffusion-based Mask Generation Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Siting Huang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiangyang Luo</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yifan Xie</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weijiang Yu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Heng Chang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23888v1-abstract-short" style="display: inline;"> Face editing modifies the appearance of face, which plays a key role in customization and enhancement of personal images. Although much work have achieved remarkable success in text-driven face editing, they still face significant challenges as none of them simultaneously fulfill the characteristics of diversity, controllability and flexibility. To address this challenge, we propose MuseFace, a te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23888v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23888v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23888v1-abstract-full" style="display: none;"> Face editing modifies the appearance of face, which plays a key role in customization and enhancement of personal images. Although much work have achieved remarkable success in text-driven face editing, they still face significant challenges as none of them simultaneously fulfill the characteristics of diversity, controllability and flexibility. To address this challenge, we propose MuseFace, a text-driven face editing framework, which relies solely on text prompt to enable face editing. Specifically, MuseFace integrates a Text-to-Mask diffusion model and a semantic-aware face editing model, capable of directly generating fine-grained semantic masks from text and performing face editing. The Text-to-Mask diffusion model provides \textit{diversity} and \textit{flexibility} to the framework, while the semantic-aware face editing model ensures \textit{controllability} of the framework. Our framework can create fine-grained semantic masks, making precise face editing possible, and significantly enhancing the controllability and flexibility of face editing models. Extensive experiments demonstrate that MuseFace achieves superior high-fidelity performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23888v1-abstract-full').style.display = 'none'; document.getElementById('2503.23888v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures,IEEE International Conference on Multimedia & Expo 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23434">arXiv:2503.23434</a> <span> [<a href="https://arxiv.org/pdf/2503.23434">pdf</a>, <a href="https://arxiv.org/format/2503.23434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Trustworthy GUI Agents: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yucheng Shi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+W">Wenlin Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenhu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ninghao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23434v1-abstract-short" style="display: inline;"> GUI agents, powered by large foundation models, can interact with digital interfaces, enabling various applications in web automation, mobile navigation, and software testing. However, their increasing autonomy has raised critical concerns about their security, privacy, and safety. This survey examines the trustworthiness of GUI agents in five critical dimensions: security vulnerabilities, reliabi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23434v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23434v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23434v1-abstract-full" style="display: none;"> GUI agents, powered by large foundation models, can interact with digital interfaces, enabling various applications in web automation, mobile navigation, and software testing. However, their increasing autonomy has raised critical concerns about their security, privacy, and safety. This survey examines the trustworthiness of GUI agents in five critical dimensions: security vulnerabilities, reliability in dynamic environments, transparency and explainability, ethical considerations, and evaluation methodologies. We also identify major challenges such as vulnerability to adversarial attacks, cascading failure modes in sequential decision-making, and a lack of realistic evaluation benchmarks. These issues not only hinder real-world deployment but also call for comprehensive mitigation strategies beyond task success. As GUI agents become more widespread, establishing robust safety standards and responsible development practices is essential. This survey provides a foundation for advancing trustworthy GUI agents through systematic understanding and future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23434v1-abstract-full').style.display = 'none'; document.getElementById('2503.23434v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, work in process</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23162">arXiv:2503.23162</a> <span> [<a href="https://arxiv.org/pdf/2503.23162">pdf</a>, <a href="https://arxiv.org/format/2503.23162">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeuralGS: Bridging Neural Fields and 3D Gaussian Splatting for Compact 3D Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenyu Tang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chaoran Feng</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinhua Cheng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wangbo Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junwu Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuan Liu</a>, <a href="/search/cs?searchtype=author&query=Long%2C+X">Xiaoxiao Long</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23162v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) demonstrates superior quality and rendering speed, but with millions of 3D Gaussians and significant storage and transmission costs. Recent 3DGS compression methods mainly concentrate on compressing Scaffold-GS, achieving impressive performance but with an additional voxel structure and a complex encoding and quantization strategy. In this paper, we aim to develop a si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23162v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23162v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23162v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) demonstrates superior quality and rendering speed, but with millions of 3D Gaussians and significant storage and transmission costs. Recent 3DGS compression methods mainly concentrate on compressing Scaffold-GS, achieving impressive performance but with an additional voxel structure and a complex encoding and quantization strategy. In this paper, we aim to develop a simple yet effective method called NeuralGS that explores in another way to compress the original 3DGS into a compact representation without the voxel structure and complex quantization strategies. Our observation is that neural fields like NeRF can represent complex 3D scenes with Multi-Layer Perceptron (MLP) neural networks using only a few megabytes. Thus, NeuralGS effectively adopts the neural field representation to encode the attributes of 3D Gaussians with MLPs, only requiring a small storage size even for a large-scale scene. To achieve this, we adopt a clustering strategy and fit the Gaussians with different tiny MLPs for each cluster, based on importance scores of Gaussians as fitting weights. We experiment on multiple datasets, achieving a 45-times average model size reduction without harming the visual quality. The compression performance of our method on original 3DGS is comparable to the dedicated Scaffold-GS-based compression methods, which demonstrate the huge potential of directly compressing original 3DGS with neural fields. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23162v1-abstract-full').style.display = 'none'; document.getElementById('2503.23162v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://pku-yuangroup.github.io/NeuralGS/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22091">arXiv:2503.22091</a> <span> [<a href="https://arxiv.org/pdf/2503.22091">pdf</a>, <a href="https://arxiv.org/format/2503.22091">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> A Graph-native Optimization Framework for Complex Graph Queries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+B">Bingqing Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiaoli Zhou</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+L">Longbin Lai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yufan Yang</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+Y">Yunkai Lou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenyuan Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22091v1-abstract-short" style="display: inline;"> This technical report extends the SIGMOD 2025 paper "A Modular Graph-Native Query Optimization Framework" by providing a comprehensive exposition of GOpt's advanced technical mechanisms, implementation strategies, and extended evaluations. While the original paper introduced GOpt's unified intermediate representation (GIR) and demonstrated its performance benefits, this report delves into the fram… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22091v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22091v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22091v1-abstract-full" style="display: none;"> This technical report extends the SIGMOD 2025 paper "A Modular Graph-Native Query Optimization Framework" by providing a comprehensive exposition of GOpt's advanced technical mechanisms, implementation strategies, and extended evaluations. While the original paper introduced GOpt's unified intermediate representation (GIR) and demonstrated its performance benefits, this report delves into the framework's implementation depth: (1) the full specification of GOpt's optimization rules; (2) a systematic treatment of semantic variations (e.g., homomorphism vs. edge-distinct matching) across query languages and their implications for optimization; (3) the design of GOpt's Physical integration interface, enabling seamless integration with transactional (Neo4j) and distributed (GraphScope) backends via engine-specific operator customization; and (4) a detailed analysis of plan transformations for LDBC benchmark queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22091v1-abstract-full').style.display = 'none'; document.getElementById('2503.22091v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21779">arXiv:2503.21779</a> <span> [<a href="https://arxiv.org/pdf/2503.21779">pdf</a>, <a href="https://arxiv.org/format/2503.21779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X$^{2}$-Gaussian: 4D Radiative Gaussian Splatting for Continuous-time Tomographic Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weihao Yu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+R">Ruyi Zha</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhiwen Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenxin Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21779v1-abstract-short" style="display: inline;"> Four-dimensional computed tomography (4D CT) reconstruction is crucial for capturing dynamic anatomical changes but faces inherent limitations from conventional phase-binning workflows. Current methods discretize temporal resolution into fixed phases with respiratory gating devices, introducing motion misalignment and restricting clinical practicality. In this paper, We propose X$^2$-Gaussian, a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21779v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21779v1-abstract-full" style="display: none;"> Four-dimensional computed tomography (4D CT) reconstruction is crucial for capturing dynamic anatomical changes but faces inherent limitations from conventional phase-binning workflows. Current methods discretize temporal resolution into fixed phases with respiratory gating devices, introducing motion misalignment and restricting clinical practicality. In this paper, We propose X$^2$-Gaussian, a novel framework that enables continuous-time 4D-CT reconstruction by integrating dynamic radiative Gaussian splatting with self-supervised respiratory motion learning. Our approach models anatomical dynamics through a spatiotemporal encoder-decoder architecture that predicts time-varying Gaussian deformations, eliminating phase discretization. To remove dependency on external gating devices, we introduce a physiology-driven periodic consistency loss that learns patient-specific breathing cycles directly from projections via differentiable optimization. Extensive experiments demonstrate state-of-the-art performance, achieving a 9.93 dB PSNR gain over traditional methods and 2.25 dB improvement against prior Gaussian splatting techniques. By unifying continuous motion modeling with hardware-free period learning, X$^2$-Gaussian advances high-fidelity 4D CT reconstruction for dynamic clinical imaging. Project website at: https://x2-gaussian.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21779v1-abstract-full').style.display = 'none'; document.getElementById('2503.21779v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://x2-gaussian.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20314">arXiv:2503.20314</a> <span> [<a href="https://arxiv.org/pdf/2503.20314">pdf</a>, <a href="https://arxiv.org/format/2503.20314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Wan: Open and Advanced Large-Scale Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=WanTeam"> WanTeam</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ang Wang</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+B">Baole Ai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bin Wen</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chen-Wei Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Di Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feiwu Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haiming Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jianyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinkai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kang Zhao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Keyu Yan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianghua Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+M">Mengyang Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pandeng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pingyu Wu</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20314v1-abstract-short" style="display: inline;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20314v1-abstract-full" style="display: none;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluation metrics. These contributions collectively enhance the model's performance and versatility. Specifically, Wan is characterized by four key features: Leading Performance: The 14B model of Wan, trained on a vast dataset comprising billions of images and videos, demonstrates the scaling laws of video generation with respect to both data and model size. It consistently outperforms the existing open-source models as well as state-of-the-art commercial solutions across multiple internal and external benchmarks, demonstrating a clear and significant performance superiority. Comprehensiveness: Wan offers two capable models, i.e., 1.3B and 14B parameters, for efficiency and effectiveness respectively. It also covers multiple downstream applications, including image-to-video, instruction-guided video editing, and personal video generation, encompassing up to eight tasks. Consumer-Grade Efficiency: The 1.3B model demonstrates exceptional resource efficiency, requiring only 8.19 GB VRAM, making it compatible with a wide range of consumer-grade GPUs. Openness: We open-source the entire series of Wan, including source code and all models, with the goal of fostering the growth of the video generation community. This openness seeks to significantly expand the creative possibilities of video production in the industry and provide academia with high-quality video foundation models. All the code and models are available at https://github.com/Wan-Video/Wan2.1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'none'; document.getElementById('2503.20314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 pages, 33 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20290">arXiv:2503.20290</a> <span> [<a href="https://arxiv.org/pdf/2503.20290">pdf</a>, <a href="https://arxiv.org/format/2503.20290">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> QualiSpeech: A Speech Quality Assessment Dataset with Natural Language Reasoning and Descriptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyin Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenyi Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xianzhao Chen</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xiaohai Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Tsao%2C+Y">Yu Tsao</a>, <a href="/search/cs?searchtype=author&query=Yamagishi%2C+J">Junichi Yamagishi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20290v1-abstract-short" style="display: inline;"> This paper explores a novel perspective to speech quality assessment by leveraging natural language descriptions, offering richer, more nuanced insights than traditional numerical scoring methods. Natural language feedback provides instructive recommendations and detailed evaluations, yet existing datasets lack the comprehensive annotations needed for this approach. To bridge this gap, we introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20290v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20290v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20290v1-abstract-full" style="display: none;"> This paper explores a novel perspective to speech quality assessment by leveraging natural language descriptions, offering richer, more nuanced insights than traditional numerical scoring methods. Natural language feedback provides instructive recommendations and detailed evaluations, yet existing datasets lack the comprehensive annotations needed for this approach. To bridge this gap, we introduce QualiSpeech, a comprehensive low-level speech quality assessment dataset encompassing 11 key aspects and detailed natural language comments that include reasoning and contextual insights. Additionally, we propose the QualiSpeech Benchmark to evaluate the low-level speech understanding capabilities of auditory large language models (LLMs). Experimental results demonstrate that finetuned auditory LLMs can reliably generate detailed descriptions of noise and distortion, effectively identifying their types and temporal characteristics. The results further highlight the potential for incorporating reasoning to enhance the accuracy and reliability of quality assessments. The dataset will be released at https://huggingface.co/datasets/tsinghua-ee/QualiSpeech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20290v1-abstract-full').style.display = 'none'; document.getElementById('2503.20290v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20020">arXiv:2503.20020</a> <span> [<a href="https://arxiv.org/pdf/2503.20020">pdf</a>, <a href="https://arxiv.org/format/2503.20020">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Gemini Robotics: Bringing AI into the Physical World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemini+Robotics+Team"> Gemini Robotics Team</a>, <a href="/search/cs?searchtype=author&query=Abeyruwan%2C+S">Saminda Abeyruwan</a>, <a href="/search/cs?searchtype=author&query=Ainslie%2C+J">Joshua Ainslie</a>, <a href="/search/cs?searchtype=author&query=Alayrac%2C+J">Jean-Baptiste Alayrac</a>, <a href="/search/cs?searchtype=author&query=Arenas%2C+M+G">Montserrat Gonzalez Arenas</a>, <a href="/search/cs?searchtype=author&query=Armstrong%2C+T">Travis Armstrong</a>, <a href="/search/cs?searchtype=author&query=Balakrishna%2C+A">Ashwin Balakrishna</a>, <a href="/search/cs?searchtype=author&query=Baruch%2C+R">Robert Baruch</a>, <a href="/search/cs?searchtype=author&query=Bauza%2C+M">Maria Bauza</a>, <a href="/search/cs?searchtype=author&query=Blokzijl%2C+M">Michiel Blokzijl</a>, <a href="/search/cs?searchtype=author&query=Bohez%2C+S">Steven Bohez</a>, <a href="/search/cs?searchtype=author&query=Bousmalis%2C+K">Konstantinos Bousmalis</a>, <a href="/search/cs?searchtype=author&query=Brohan%2C+A">Anthony Brohan</a>, <a href="/search/cs?searchtype=author&query=Buschmann%2C+T">Thomas Buschmann</a>, <a href="/search/cs?searchtype=author&query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&query=Cabi%2C+S">Serkan Cabi</a>, <a href="/search/cs?searchtype=author&query=Caluwaerts%2C+K">Ken Caluwaerts</a>, <a href="/search/cs?searchtype=author&query=Casarini%2C+F">Federico Casarini</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+O">Oscar Chang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J+E">Jose Enrique Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Chiang%2C+H+L">Hao-Tien Lewis Chiang</a>, <a href="/search/cs?searchtype=author&query=Choromanski%2C+K">Krzysztof Choromanski</a>, <a href="/search/cs?searchtype=author&query=D%27Ambrosio%2C+D">David D'Ambrosio</a>, <a href="/search/cs?searchtype=author&query=Dasari%2C+S">Sudeep Dasari</a> , et al. (93 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20020v1-abstract-short" style="display: inline;"> Recent advancements in large multimodal models have led to the emergence of remarkable generalist capabilities in digital domains, yet their translation to physical agents such as robots remains a significant challenge. This report introduces a new family of AI models purposefully designed for robotics and built upon the foundation of Gemini 2.0. We present Gemini Robotics, an advanced Vision-Lang… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20020v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20020v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20020v1-abstract-full" style="display: none;"> Recent advancements in large multimodal models have led to the emergence of remarkable generalist capabilities in digital domains, yet their translation to physical agents such as robots remains a significant challenge. This report introduces a new family of AI models purposefully designed for robotics and built upon the foundation of Gemini 2.0. We present Gemini Robotics, an advanced Vision-Language-Action (VLA) generalist model capable of directly controlling robots. Gemini Robotics executes smooth and reactive movements to tackle a wide range of complex manipulation tasks while also being robust to variations in object types and positions, handling unseen environments as well as following diverse, open vocabulary instructions. We show that with additional fine-tuning, Gemini Robotics can be specialized to new capabilities including solving long-horizon, highly dexterous tasks, learning new short-horizon tasks from as few as 100 demonstrations and adapting to completely novel robot embodiments. This is made possible because Gemini Robotics builds on top of the Gemini Robotics-ER model, the second model we introduce in this work. Gemini Robotics-ER (Embodied Reasoning) extends Gemini's multimodal reasoning capabilities into the physical world, with enhanced spatial and temporal understanding. This enables capabilities relevant to robotics including object detection, pointing, trajectory and grasp prediction, as well as multi-view correspondence and 3D bounding box predictions. We show how this novel combination can support a variety of robotics applications. We also discuss and address important safety considerations related to this new class of robotics foundation models. The Gemini Robotics family marks a substantial step towards developing general-purpose robots that realizes AI's potential in the physical world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20020v1-abstract-full').style.display = 'none'; document.getElementById('2503.20020v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19506">arXiv:2503.19506</a> <span> [<a href="https://arxiv.org/pdf/2503.19506">pdf</a>, <a href="https://arxiv.org/format/2503.19506">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIV.2024.3414852">10.1109/TIV.2024.3414852 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MM-LINS: a Multi-Map LiDAR-Inertial System for Over-Degenerate Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yongxin Ma</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shenghai Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhi%2C+T">Tian Zhi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenlu Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lihua Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19506v1-abstract-short" style="display: inline;"> SLAM plays a crucial role in automation tasks, such as warehouse logistics, healthcare robotics, and restaurant delivery. These scenes come with various challenges, including navigating around crowds of people, dealing with flying plastic bags that can temporarily blind sensors, and addressing reduced LiDAR density caused by cooking smoke. Such scenarios can result in over-degeneracy, causing the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19506v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19506v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19506v1-abstract-full" style="display: none;"> SLAM plays a crucial role in automation tasks, such as warehouse logistics, healthcare robotics, and restaurant delivery. These scenes come with various challenges, including navigating around crowds of people, dealing with flying plastic bags that can temporarily blind sensors, and addressing reduced LiDAR density caused by cooking smoke. Such scenarios can result in over-degeneracy, causing the map to drift. To address this issue, this paper presents a multi-map LiDAR-inertial system (MM-LINS) for the first time. The front-end employs an iterated error state Kalman filter for state estimation and introduces a reliable evaluation strategy for degeneracy detection. If over-degeneracy is detected, the active map will be stored into sleeping maps. Subsequently, the system continuously attempts to construct new maps using a dynamic initialization method to ensure successful initialization upon leaving the over-degeneracy. Regarding the back-end, the Scan Context descriptor is utilized to detect inter-map similarity. Upon successful recognition of a sleeping map that shares a common region with the active map, the overlapping trajectory region is utilized to constrain the positional transformation near the edge of the prior map. In response to this, a constraint-enhanced map fusion strategy is proposed to achieve high-precision positional and mapping results. Experiments have been conducted separately on both public datasets that exhibited over-degenerate conditions and in real-world environments. These tests demonstrated the effectiveness of MM-LINS in over-degeneracy environment. Our codes are open-sourced on Github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19506v1-abstract-full').style.display = 'none'; document.getElementById('2503.19506v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Intelligent Vehicles</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19046">arXiv:2503.19046</a> <span> [<a href="https://arxiv.org/pdf/2503.19046">pdf</a>, <a href="https://arxiv.org/format/2503.19046">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning Beamforming Codebooks for Active Sensing with Reconfigurable Intelligent Surface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhongze Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19046v2-abstract-short" style="display: inline;"> This paper explores the design of beamforming codebooks for the base station (BS) and for the reconfigurable intelligent surfaces (RISs) in an active sensing scheme for uplink localization, in which the mobile user transmits a sequence of pilots to the BS through reflection at the RISs, and the BS and the RISs are adaptively configured by carefully choosing BS beamforming codeword and RIS codeword… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19046v2-abstract-full').style.display = 'inline'; document.getElementById('2503.19046v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19046v2-abstract-full" style="display: none;"> This paper explores the design of beamforming codebooks for the base station (BS) and for the reconfigurable intelligent surfaces (RISs) in an active sensing scheme for uplink localization, in which the mobile user transmits a sequence of pilots to the BS through reflection at the RISs, and the BS and the RISs are adaptively configured by carefully choosing BS beamforming codeword and RIS codewords from their respective codebooks in a sequential manner to progressively focus onto the user. Most existing codebook designs for RIS are not tailored for active sensing, by which we mean the choice of the next codeword should depend on the measurements made so far, and the sequence of codewords should dynamically focus reflection toward the user. Moreover, most existing codeword selection methods rely on exhaustive search in beam training to identify the codeword with the highest signal-to-noise ratio (SNR), thus incurring substantial pilot overhead as the size of the codebook scales. This paper proposes a learning-based approach for codebook construction and for codeword selection for active sensing. The proposed learning approach aims to locate a target in the service area by recursively selecting a sequence of BS beamforming codewords and RIS codewords from the respective codebooks as more measurements become available without exhaustive beam training. The codebook design and the codeword selection fuse key ideas from the vector quantized variational autoencoder (VQ-VAE) and the long short-term memory (LSTM) network to learn respectively the discrete function space of the codebook and the temporal dependencies between measurements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19046v2-abstract-full').style.display = 'none'; document.getElementById('2503.19046v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in IEEE Transactions on Wireless Communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18923">arXiv:2503.18923</a> <span> [<a href="https://arxiv.org/pdf/2503.18923">pdf</a>, <a href="https://arxiv.org/format/2503.18923">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Video SimpleQA: Towards Factuality Evaluation in Large Video Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+M">Meng Cao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+P">Pengfei Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingyao Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jihao Gu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haoran Tang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haoze Zhao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Jiahua Dong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wangbo Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Reid%2C+I">Ian Reid</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18923v1-abstract-short" style="display: inline;"> Recent advancements in Large Video Language Models (LVLMs) have highlighted their potential for multi-modal understanding, yet evaluating their factual grounding in video contexts remains a critical unsolved challenge. To address this gap, we introduce Video SimpleQA, the first comprehensive benchmark tailored for factuality evaluation of LVLMs. Our work distinguishes from existing video benchmark… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18923v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18923v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18923v1-abstract-full" style="display: none;"> Recent advancements in Large Video Language Models (LVLMs) have highlighted their potential for multi-modal understanding, yet evaluating their factual grounding in video contexts remains a critical unsolved challenge. To address this gap, we introduce Video SimpleQA, the first comprehensive benchmark tailored for factuality evaluation of LVLMs. Our work distinguishes from existing video benchmarks through the following key features: 1) Knowledge required: demanding integration of external knowledge beyond the explicit narrative; 2) Fact-seeking question: targeting objective, undisputed events or relationships, avoiding subjective interpretation; 3) Definitive & short-form answer: Answers are crafted as unambiguous and definitively correct in a short format, enabling automated evaluation through LLM-as-a-judge frameworks with minimal scoring variance; 4) External-source verified: All annotations undergo rigorous validation against authoritative external references to ensure the reliability; 5) Temporal reasoning required: The annotated question types encompass both static single-frame understanding and dynamic temporal reasoning, explicitly evaluating LVLMs factuality under the long-context dependencies. We extensively evaluate 41 state-of-the-art LVLMs and summarize key findings as follows: 1) Current LVLMs exhibit notable deficiencies in factual adherence, particularly for open-source models. The best-performing model Gemini-1.5-Pro achieves merely an F-score of 54.4%; 2) Test-time compute paradigms show insignificant performance gains, revealing fundamental constraints for enhancing factuality through post-hoc computation; 3) Retrieval-Augmented Generation demonstrates consistent improvements at the cost of additional inference time overhead, presenting a critical efficiency-performance trade-off. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18923v1-abstract-full').style.display = 'none'; document.getElementById('2503.18923v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16976">arXiv:2503.16976</a> <span> [<a href="https://arxiv.org/pdf/2503.16976">pdf</a>, <a href="https://arxiv.org/format/2503.16976">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GeoT: Geometry-guided Instance-dependent Transition Matrix for Semi-supervised Tooth Point Cloud Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weihao Yu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xiaoqing Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenxin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16976v1-abstract-short" style="display: inline;"> Achieving meticulous segmentation of tooth point clouds from intra-oral scans stands as an indispensable prerequisite for various orthodontic applications. Given the labor-intensive nature of dental annotation, a significant amount of data remains unlabeled, driving increasing interest in semi-supervised approaches. One primary challenge of existing semi-supervised medical segmentation methods lie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16976v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16976v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16976v1-abstract-full" style="display: none;"> Achieving meticulous segmentation of tooth point clouds from intra-oral scans stands as an indispensable prerequisite for various orthodontic applications. Given the labor-intensive nature of dental annotation, a significant amount of data remains unlabeled, driving increasing interest in semi-supervised approaches. One primary challenge of existing semi-supervised medical segmentation methods lies in noisy pseudo labels generated for unlabeled data. To address this challenge, we propose GeoT, the first framework that employs instance-dependent transition matrix (IDTM) to explicitly model noise in pseudo labels for semi-supervised dental segmentation. Specifically, to handle the extensive solution space of IDTM arising from tens of thousands of dental points, we introduce tooth geometric priors through two key components: point-level geometric regularization (PLGR) to enhance consistency between point adjacency relationships in 3D and IDTM spaces, and class-level geometric smoothing (CLGS) to leverage the fixed spatial distribution of tooth categories for optimal IDTM estimation. Extensive experiments performed on the public Teeth3DS dataset and private dataset demonstrate that our method can make full utilization of unlabeled data to facilitate segmentation, achieving performance comparable to fully supervised methods with only $20\%$ of the labeled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16976v1-abstract-full').style.display = 'none'; document.getElementById('2503.16976v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IPMI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15234">arXiv:2503.15234</a> <span> [<a href="https://arxiv.org/pdf/2503.15234">pdf</a>, <a href="https://arxiv.org/format/2503.15234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoE: Chain-of-Explanation via Automatic Visual Concept Circuit Description and Polysemanticity Quantification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenlong Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qilong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chuang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15234v1-abstract-short" style="display: inline;"> Explainability is a critical factor influencing the wide deployment of deep vision models (DVMs). Concept-based post-hoc explanation methods can provide both global and local insights into model decisions. However, current methods in this field face challenges in that they are inflexible to automatically construct accurate and sufficient linguistic explanations for global concepts and local circui… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15234v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15234v1-abstract-full" style="display: none;"> Explainability is a critical factor influencing the wide deployment of deep vision models (DVMs). Concept-based post-hoc explanation methods can provide both global and local insights into model decisions. However, current methods in this field face challenges in that they are inflexible to automatically construct accurate and sufficient linguistic explanations for global concepts and local circuits. Particularly, the intrinsic polysemanticity in semantic Visual Concepts (VCs) impedes the interpretability of concepts and DVMs, which is underestimated severely. In this paper, we propose a Chain-of-Explanation (CoE) approach to address these issues. Specifically, CoE automates the decoding and description of VCs to construct global concept explanation datasets. Further, to alleviate the effect of polysemanticity on model explainability, we design a concept polysemanticity disentanglement and filtering mechanism to distinguish the most contextually relevant concept atoms. Besides, a Concept Polysemanticity Entropy (CPE), as a measure of model interpretability, is formulated to quantify the degree of concept uncertainty. The modeling of deterministic concepts is upgraded to uncertain concept atom distributions. Finally, CoE automatically enables linguistic local explanations of the decision-making process of DVMs by tracing the concept circuit. GPT-4o and human-based experiments demonstrate the effectiveness of CPE and the superiority of CoE, achieving an average absolute improvement of 36% in terms of explainability scores. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15234v1-abstract-full').style.display = 'none'; document.getElementById('2503.15234v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14573">arXiv:2503.14573</a> <span> [<a href="https://arxiv.org/pdf/2503.14573">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Three-dimensional Reconstruction of the Lumbar Spine with Submillimeter Accuracy Using Biplanar X-ray Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wanxin Yu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhemin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+Y">Yihang Bao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+C">Chunjie Xia</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+R">Rongshan Cheng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yan Yu</a>, <a href="/search/cs?searchtype=author&query=Tsai%2C+T">Tsung-Yuan Tsai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14573v1-abstract-short" style="display: inline;"> Three-dimensional reconstruction of the spine under weight-bearing conditions from biplanar X-ray images is of great importance for the clinical assessment of spinal diseases. However, the current fully automated reconstruction methods have low accuracy and fail to meet the clinical application standards. This study developed and validated a fully automated method for high-accuracy 3D reconstructi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14573v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14573v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14573v1-abstract-full" style="display: none;"> Three-dimensional reconstruction of the spine under weight-bearing conditions from biplanar X-ray images is of great importance for the clinical assessment of spinal diseases. However, the current fully automated reconstruction methods have low accuracy and fail to meet the clinical application standards. This study developed and validated a fully automated method for high-accuracy 3D reconstruction of the lumbar spine from biplanar X-ray images. The method involves lumbar decomposition and landmark detection from the raw X-ray images, followed by a deformable model and landmark-weighted 2D-3D registration approach. The reconstruction accuracy was validated by the gold standard obtained through the registration of CT-segmented vertebral models with the biplanar X-ray images. The proposed method achieved a 3D reconstruction accuracy of 0.80 mm, representing a significant improvement over the mainstream approaches. This study will contribute to the clinical diagnosis of lumbar in weight-bearing positions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14573v1-abstract-full').style.display = 'none'; document.getElementById('2503.14573v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 10 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13709">arXiv:2503.13709</a> <span> [<a href="https://arxiv.org/pdf/2503.13709">pdf</a>, <a href="https://arxiv.org/format/2503.13709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Time Series Analysis: A Tutorial and Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yushan Jiang</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+K">Kanghui Ning</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zijie Pan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xuyang Shen</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jingchao Ni</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenchao Yu</a>, <a href="/search/cs?searchtype=author&query=Schneider%2C+A">Anderson Schneider</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Nevmyvaka%2C+Y">Yuriy Nevmyvaka</a>, <a href="/search/cs?searchtype=author&query=Song%2C+D">Dongjin Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13709v1-abstract-short" style="display: inline;"> Multi-modal time series analysis has recently emerged as a prominent research area in data mining, driven by the increasing availability of diverse data modalities, such as text, images, and structured tabular data from real-world sources. However, effective analysis of multi-modal time series is hindered by data heterogeneity, modality gap, misalignment, and inherent noise. Recent advancements in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13709v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13709v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13709v1-abstract-full" style="display: none;"> Multi-modal time series analysis has recently emerged as a prominent research area in data mining, driven by the increasing availability of diverse data modalities, such as text, images, and structured tabular data from real-world sources. However, effective analysis of multi-modal time series is hindered by data heterogeneity, modality gap, misalignment, and inherent noise. Recent advancements in multi-modal time series methods have exploited the multi-modal context via cross-modal interactions based on deep learning methods, significantly enhancing various downstream tasks. In this tutorial and survey, we present a systematic and up-to-date overview of multi-modal time series datasets and methods. We first state the existing challenges of multi-modal time series analysis and our motivations, with a brief introduction of preliminaries. Then, we summarize the general pipeline and categorize existing methods through a unified cross-modal interaction framework encompassing fusion, alignment, and transference at different levels (\textit{i.e.}, input, intermediate, output), where key concepts and ideas are highlighted. We also discuss the real-world applications of multi-modal analysis for both standard and spatial time series, tailored to general and specific domains. Finally, we discuss future research directions to help practitioners explore and exploit multi-modal time series. The up-to-date resources are provided in the GitHub repository: https://github.com/UConn-DSIS/Multi-modal-Time-Series-Analysis <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13709v1-abstract-full').style.display = 'none'; document.getElementById('2503.13709v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12404">arXiv:2503.12404</a> <span> [<a href="https://arxiv.org/pdf/2503.12404">pdf</a>, <a href="https://arxiv.org/format/2503.12404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAM2-ELNet: Label Enhancement and Automatic Annotation for Remote Sensing Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianhao Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenshuo Yu</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yuanchao Lv</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiance Sun</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+B">Bokang Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingyang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12404v1-abstract-short" style="display: inline;"> Remote sensing image segmentation is crucial for environmental monitoring, disaster assessment, and resource management, directly affecting the accuracy and efficiency of surface information extraction. The performance of existing supervised models in remote sensing image segmentation tasks highly depends on the quality of label data. However, current label data mainly relies on manual annotation,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12404v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12404v1-abstract-full" style="display: none;"> Remote sensing image segmentation is crucial for environmental monitoring, disaster assessment, and resource management, directly affecting the accuracy and efficiency of surface information extraction. The performance of existing supervised models in remote sensing image segmentation tasks highly depends on the quality of label data. However, current label data mainly relies on manual annotation, which comes with high time costs and is subject to subjective interference, resulting in distortion of label boundaries and often a loss of detail. To solve the above problems, our work proposes an Edge-enhanced Labeling Network, called SAM2-ELNet, which incorporates a labeling module and an edge attention mechanism. This model effectively addresses issues such as label detail loss, fragmentation, and inaccurate boundaries. Due to the scarcity of manually annotated remote sensing data, the feature extraction capabilities of traditional neural networks are limited. Our method uses the Hiera backbone of the pre-trained self-supervised large model segment anything model 2 (SAM2) as the encoder, achieves high-quality and efficient feature extraction even with small samples by fine-tuning on downstream tasks. This study compared the training effects of original and enhanced labels on the manually annotated Deep-SAR Oil Spill (SOS) dataset. Results showed that the model trained with enhanced labels performed better and had a lower final loss, indicating closer alignment with the real data distribution. Our work also explores the potential of extending the model into an efficient automatic annotation framework through generalization experiments, facilitating large-scale remote sensing image interpretation and intelligent recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12404v1-abstract-full').style.display = 'none'; document.getElementById('2503.12404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10792">arXiv:2503.10792</a> <span> [<a href="https://arxiv.org/pdf/2503.10792">pdf</a>, <a href="https://arxiv.org/format/2503.10792">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Byzantine-Resilient Federated Learning via Distributed Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yufei Xia</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenrui Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qiongxiu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10792v1-abstract-short" style="display: inline;"> Byzantine attacks present a critical challenge to Federated Learning (FL), where malicious participants can disrupt the training process, degrade model accuracy, and compromise system reliability. Traditional FL frameworks typically rely on aggregation-based protocols for model updates, leaving them vulnerable to sophisticated adversarial strategies. In this paper, we demonstrate that distributed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10792v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10792v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10792v1-abstract-full" style="display: none;"> Byzantine attacks present a critical challenge to Federated Learning (FL), where malicious participants can disrupt the training process, degrade model accuracy, and compromise system reliability. Traditional FL frameworks typically rely on aggregation-based protocols for model updates, leaving them vulnerable to sophisticated adversarial strategies. In this paper, we demonstrate that distributed optimization offers a principled and robust alternative to aggregation-centric methods. Specifically, we show that the Primal-Dual Method of Multipliers (PDMM) inherently mitigates Byzantine impacts by leveraging its fault-tolerant consensus mechanism. Through extensive experiments on three datasets (MNIST, FashionMNIST, and Olivetti), under various attack scenarios including bit-flipping and Gaussian noise injection, we validate the superior resilience of distributed optimization protocols. Compared to traditional aggregation-centric approaches, PDMM achieves higher model utility, faster convergence, and improved stability. Our results highlight the effectiveness of distributed optimization in defending against Byzantine threats, paving the way for more secure and resilient federated learning systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10792v1-abstract-full').style.display = 'none'; document.getElementById('2503.10792v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09977">arXiv:2503.09977</a> <span> [<a href="https://arxiv.org/pdf/2503.09977">pdf</a>, <a href="https://arxiv.org/format/2503.09977">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Quadratic Transform for Fractional Programs in Signal Processing and Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+K">Kaiming Shen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09977v1-abstract-short" style="display: inline;"> Fractional programming (FP) is a branch of mathematical optimization that deals with the optimization of ratios. It is an invaluable tool for signal processing and machine learning, because many key metrics in these fields are fractionally structured, e.g., the signal-to-interference-plus-noise ratio (SINR) in wireless communications, the Cramer-Rao bound (CRB) in radar sensing, the normalized cut… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09977v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09977v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09977v1-abstract-full" style="display: none;"> Fractional programming (FP) is a branch of mathematical optimization that deals with the optimization of ratios. It is an invaluable tool for signal processing and machine learning, because many key metrics in these fields are fractionally structured, e.g., the signal-to-interference-plus-noise ratio (SINR) in wireless communications, the Cramer-Rao bound (CRB) in radar sensing, the normalized cut in graph clustering, and the margin in support vector machine (SVM). This article provides a comprehensive review of both the theory and applications of a recently developed FP technique known as the quadratic transform, which can be applied to a wide variety of FP problems, including both the minimization and the maximization of the sum of functions of ratios as well as matrix ratio problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09977v1-abstract-full').style.display = 'none'; document.getElementById('2503.09977v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07505">arXiv:2503.07505</a> <span> [<a href="https://arxiv.org/pdf/2503.07505">pdf</a>, <a href="https://arxiv.org/format/2503.07505">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> From Centralized to Decentralized Federated Learning: Theoretical Insights, Privacy Preservation, and Robustness Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qiongxiu Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenrui Yu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yufei Xia</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+J">Jun Pang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07505v1-abstract-short" style="display: inline;"> Federated Learning (FL) enables collaborative learning without directly sharing individual's raw data. FL can be implemented in either a centralized (server-based) or decentralized (peer-to-peer) manner. In this survey, we present a novel perspective: the fundamental difference between centralized FL (CFL) and decentralized FL (DFL) is not merely the network topology, but the underlying training p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07505v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07505v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07505v1-abstract-full" style="display: none;"> Federated Learning (FL) enables collaborative learning without directly sharing individual's raw data. FL can be implemented in either a centralized (server-based) or decentralized (peer-to-peer) manner. In this survey, we present a novel perspective: the fundamental difference between centralized FL (CFL) and decentralized FL (DFL) is not merely the network topology, but the underlying training protocol: separate aggregation vs. joint optimization. We argue that this distinction in protocol leads to significant differences in model utility, privacy preservation, and robustness to attacks. We systematically review and categorize existing works in both CFL and DFL according to the type of protocol they employ. This taxonomy provides deeper insights into prior research and clarifies how various approaches relate or differ. Through our analysis, we identify key gaps in the literature. In particular, we observe a surprising lack of exploration of DFL approaches based on distributed optimization methods, despite their potential advantages. We highlight this under-explored direction and call for more research on leveraging distributed optimization for federated learning. Overall, this work offers a comprehensive overview from centralized to decentralized FL, sheds new light on the core distinctions between approaches, and outlines open challenges and future directions for the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07505v1-abstract-full').style.display = 'none'; document.getElementById('2503.07505v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06312">arXiv:2503.06312</a> <span> [<a href="https://arxiv.org/pdf/2503.06312">pdf</a>, <a href="https://arxiv.org/format/2503.06312">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeoLangBind: Unifying Earth Observation with Agglomerative Vision-Language Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zhitong Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weikang Yu</a>, <a href="/search/cs?searchtype=author&query=Stewart%2C+A+J">Adam J Stewart</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jie Zhao</a>, <a href="/search/cs?searchtype=author&query=Lehmann%2C+N">Nils Lehmann</a>, <a href="/search/cs?searchtype=author&query=Dujardin%2C+T">Thomas Dujardin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zhenghang Yuan</a>, <a href="/search/cs?searchtype=author&query=Ghamisi%2C+P">Pedram Ghamisi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X+X">Xiao Xiang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06312v1-abstract-short" style="display: inline;"> Earth observation (EO) data, collected from diverse sensors with varying imaging principles, present significant challenges in creating unified analytical frameworks. We present GeoLangBind, a novel agglomerative vision--language foundation model that bridges the gap between heterogeneous EO data modalities using language as a unifying medium. Our approach aligns different EO data types into a sha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06312v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06312v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06312v1-abstract-full" style="display: none;"> Earth observation (EO) data, collected from diverse sensors with varying imaging principles, present significant challenges in creating unified analytical frameworks. We present GeoLangBind, a novel agglomerative vision--language foundation model that bridges the gap between heterogeneous EO data modalities using language as a unifying medium. Our approach aligns different EO data types into a shared language embedding space, enabling seamless integration and complementary feature learning from diverse sensor data. To achieve this, we construct a large-scale multimodal image--text dataset, GeoLangBind-2M, encompassing six data modalities. GeoLangBind leverages this dataset to develop a zero-shot foundation model capable of processing arbitrary numbers of EO data channels as input. Through our designed Modality-aware Knowledge Agglomeration (MaKA) module and progressive multimodal weight merging strategy, we create a powerful agglomerative foundation model that excels in both zero-shot vision--language comprehension and fine-grained visual understanding. Extensive evaluation across 23 datasets covering multiple tasks demonstrates GeoLangBind's superior performance and versatility in EO applications, offering a robust framework for various environmental monitoring and analysis tasks. The dataset and pretrained models will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06312v1-abstract-full').style.display = 'none'; document.getElementById('2503.06312v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">code & weights: https://github.com/xiong-zhitong/GeoLB-SigLIP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05035">arXiv:2503.05035</a> <span> [<a href="https://arxiv.org/pdf/2503.05035">pdf</a>, <a href="https://arxiv.org/format/2503.05035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> QuietPaw: Learning Quadrupedal Locomotion with Versatile Noise Preference Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuyou Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yihang Yao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiqi Liu</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Y">Yaru Niu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Changyi Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuxiang Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tingnan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Jie Tan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Ding Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05035v1-abstract-short" style="display: inline;"> When operating at their full capacity, quadrupedal robots can produce loud footstep noise, which can be disruptive in human-centered environments like homes, offices, and hospitals. As a result, balancing locomotion performance with noise constraints is crucial for the successful real-world deployment of quadrupedal robots. However, achieving adaptive noise control is challenging due to (a) the tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05035v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05035v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05035v1-abstract-full" style="display: none;"> When operating at their full capacity, quadrupedal robots can produce loud footstep noise, which can be disruptive in human-centered environments like homes, offices, and hospitals. As a result, balancing locomotion performance with noise constraints is crucial for the successful real-world deployment of quadrupedal robots. However, achieving adaptive noise control is challenging due to (a) the trade-off between agility and noise minimization, (b) the need for generalization across diverse deployment conditions, and (c) the difficulty of effectively adjusting policies based on noise requirements. We propose QuietPaw, a framework incorporating our Conditional Noise-Constrained Policy (CNCP), a constrained learning-based algorithm that enables flexible, noise-aware locomotion by conditioning policy behavior on noise-reduction levels. We leverage value representation decomposition in the critics, disentangling state representations from condition-dependent representations and this allows a single versatile policy to generalize across noise levels without retraining while improving the Pareto trade-off between agility and noise reduction. We validate our approach in simulation and the real world, demonstrating that CNCP can effectively balance locomotion performance and noise constraints, achieving continuously adjustable noise reduction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05035v1-abstract-full').style.display = 'none'; document.getElementById('2503.05035v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.02424">arXiv:2503.02424</a> <span> [<a href="https://arxiv.org/pdf/2503.02424">pdf</a>, <a href="https://arxiv.org/format/2503.02424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploring Intrinsic Normal Prototypes within a Single Image for Universal Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+W">Wei Luo</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yunkang Cao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Haiming Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaotian Zhang</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+J">Jianan Lou</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yuqi Cheng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Weiming Shen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenyong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.02424v1-abstract-short" style="display: inline;"> Anomaly detection (AD) is essential for industrial inspection, yet existing methods typically rely on ``comparing'' test images to normal references from a training set. However, variations in appearance and positioning often complicate the alignment of these references with the test image, limiting detection accuracy. We observe that most anomalies manifest as local variations, meaning that even… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02424v1-abstract-full').style.display = 'inline'; document.getElementById('2503.02424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02424v1-abstract-full" style="display: none;"> Anomaly detection (AD) is essential for industrial inspection, yet existing methods typically rely on ``comparing'' test images to normal references from a training set. However, variations in appearance and positioning often complicate the alignment of these references with the test image, limiting detection accuracy. We observe that most anomalies manifest as local variations, meaning that even within anomalous images, valuable normal information remains. We argue that this information is useful and may be more aligned with the anomalies since both the anomalies and the normal information originate from the same image. Therefore, rather than relying on external normality from the training set, we propose INP-Former, a novel method that extracts Intrinsic Normal Prototypes (INPs) directly from the test image. Specifically, we introduce the INP Extractor, which linearly combines normal tokens to represent INPs. We further propose an INP Coherence Loss to ensure INPs can faithfully represent normality for the testing image. These INPs then guide the INP-Guided Decoder to reconstruct only normal tokens, with reconstruction errors serving as anomaly scores. Additionally, we propose a Soft Mining Loss to prioritize hard-to-optimize samples during training. INP-Former achieves state-of-the-art performance in single-class, multi-class, and few-shot AD tasks across MVTec-AD, VisA, and Real-IAD, positioning it as a versatile and universal solution for AD. Remarkably, INP-Former also demonstrates some zero-shot AD capability. Code is available at:https://github.com/luow23/INP-Former. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02424v1-abstract-full').style.display = 'none'; document.getElementById('2503.02424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.02375">arXiv:2503.02375</a> <span> [<a href="https://arxiv.org/pdf/2503.02375">pdf</a>, <a href="https://arxiv.org/format/2503.02375">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> mmDEAR: mmWave Point Cloud Density Enhancement for Accurate Human Body Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiarui Yang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Songpengcheng Xia</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+Z">Zengyuan Lai</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lan Sun</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qi Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenxian Yu</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+L">Ling Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.02375v1-abstract-short" style="display: inline;"> Millimeter-wave (mmWave) radar offers robust sensing capabilities in diverse environments, making it a highly promising solution for human body reconstruction due to its privacy-friendly and non-intrusive nature. However, the significant sparsity of mmWave point clouds limits the estimation accuracy. To overcome this challenge, we propose a two-stage deep learning framework that enhances mmWave po… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02375v1-abstract-full').style.display = 'inline'; document.getElementById('2503.02375v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02375v1-abstract-full" style="display: none;"> Millimeter-wave (mmWave) radar offers robust sensing capabilities in diverse environments, making it a highly promising solution for human body reconstruction due to its privacy-friendly and non-intrusive nature. However, the significant sparsity of mmWave point clouds limits the estimation accuracy. To overcome this challenge, we propose a two-stage deep learning framework that enhances mmWave point clouds and improves human body reconstruction accuracy. Our method includes a mmWave point cloud enhancement module that densifies the raw data by leveraging temporal features and a multi-stage completion network, followed by a 2D-3D fusion module that extracts both 2D and 3D motion features to refine SMPL parameters. The mmWave point cloud enhancement module learns the detailed shape and posture information from 2D human masks in single-view images. However, image-based supervision is involved only during the training phase, and the inference relies solely on sparse point clouds to maintain privacy. Experiments on multiple datasets demonstrate that our approach outperforms state-of-the-art methods, with the enhanced point clouds further improving performance when integrated into existing models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02375v1-abstract-full').style.display = 'none'; document.getElementById('2503.02375v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01711">arXiv:2503.01711</a> <span> [<a href="https://arxiv.org/pdf/2503.01711">pdf</a>, <a href="https://arxiv.org/format/2503.01711">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MAPS: Motivation-Aware Personalized Search via LLM-Driven Consultation Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+W">Weicong Qin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yi Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weijie Yu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chenglei Shen</a>, <a href="/search/cs?searchtype=author&query=He%2C+M">Ming He</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jianping Fan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01711v3-abstract-short" style="display: inline;"> Personalized product search aims to retrieve and rank items that match users' preferences and search intent. Despite their effectiveness, existing approaches typically assume that users' query fully captures their real motivation. However, our analysis of a real-world e-commerce platform reveals that users often engage in relevant consultations before searching, indicating they refine intents thro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01711v3-abstract-full').style.display = 'inline'; document.getElementById('2503.01711v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01711v3-abstract-full" style="display: none;"> Personalized product search aims to retrieve and rank items that match users' preferences and search intent. Despite their effectiveness, existing approaches typically assume that users' query fully captures their real motivation. However, our analysis of a real-world e-commerce platform reveals that users often engage in relevant consultations before searching, indicating they refine intents through consultations based on motivation and need. The implied motivation in consultations is a key enhancing factor for personalized search. This unexplored area comes with new challenges including aligning contextual motivations with concise queries, bridging the category-text gap, and filtering noise within sequence history. To address these, we propose a Motivation-Aware Personalized Search (MAPS) method. It embeds queries and consultations into a unified semantic space via LLMs, utilizes a Mixture of Attention Experts (MoAE) to prioritize critical semantics, and introduces dual alignment: (1) contrastive learning aligns consultations, reviews, and product features; (2) bidirectional attention integrates motivation-aware embeddings with user preferences. Extensive experiments on real and synthetic data show MAPS outperforms existing methods in both retrieval and ranking tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01711v3-abstract-full').style.display = 'none'; document.getElementById('2503.01711v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">added project repository & dataset URL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01013">arXiv:2503.01013</a> <span> [<a href="https://arxiv.org/pdf/2503.01013">pdf</a>, <a href="https://arxiv.org/format/2503.01013">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Explainable Multi-modal Time Series Prediction with LLM-in-the-Loop </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yushan Jiang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenchao Yu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Geon Lee</a>, <a href="/search/cs?searchtype=author&query=Song%2C+D">Dongjin Song</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+K">Kijung Shin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanchi Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01013v1-abstract-short" style="display: inline;"> Time series analysis provides essential insights for real-world system dynamics and informs downstream decision-making, yet most existing methods often overlook the rich contextual signals present in auxiliary modalities. To bridge this gap, we introduce TimeXL, a multi-modal prediction framework that integrates a prototype-based time series encoder with three collaborating Large Language Models (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01013v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01013v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01013v1-abstract-full" style="display: none;"> Time series analysis provides essential insights for real-world system dynamics and informs downstream decision-making, yet most existing methods often overlook the rich contextual signals present in auxiliary modalities. To bridge this gap, we introduce TimeXL, a multi-modal prediction framework that integrates a prototype-based time series encoder with three collaborating Large Language Models (LLMs) to deliver more accurate predictions and interpretable explanations. First, a multi-modal prototype-based encoder processes both time series and textual inputs to generate preliminary forecasts alongside case-based rationales. These outputs then feed into a prediction LLM, which refines the forecasts by reasoning over the encoder's predictions and explanations. Next, a reflection LLM compares the predicted values against the ground truth, identifying textual inconsistencies or noise. Guided by this feedback, a refinement LLM iteratively enhances text quality and triggers encoder retraining. This closed-loop workflow -- prediction, critique (reflect), and refinement -- continuously boosts the framework's performance and interpretability. Empirical evaluations on four real-world datasets demonstrate that TimeXL achieves up to 8.9\% improvement in AUC and produces human-centric, multi-modal explanations, highlighting the power of LLM-driven reasoning for time series prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01013v1-abstract-full').style.display = 'none'; document.getElementById('2503.01013v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19779">arXiv:2502.19779</a> <span> [<a href="https://arxiv.org/pdf/2502.19779">pdf</a>, <a href="https://arxiv.org/format/2502.19779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Do Retrieval-Augmented Language Models Adapt to Varying User Needs? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+P">Peilin Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinlu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xingyu Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xinya Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z+Z">Zhiyu Zoey Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19779v1-abstract-short" style="display: inline;"> Recent advancements in Retrieval-Augmented Language Models (RALMs) have demonstrated their efficacy in knowledge-intensive tasks. However, existing evaluation benchmarks often assume a single optimal approach to leveraging retrieved information, failing to account for varying user needs. This paper introduces a novel evaluation framework that systematically assesses RALMs under three user need cas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19779v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19779v1-abstract-full" style="display: none;"> Recent advancements in Retrieval-Augmented Language Models (RALMs) have demonstrated their efficacy in knowledge-intensive tasks. However, existing evaluation benchmarks often assume a single optimal approach to leveraging retrieved information, failing to account for varying user needs. This paper introduces a novel evaluation framework that systematically assesses RALMs under three user need cases-Context-Exclusive, Context-First, and Memory-First-across three distinct context settings: Context Matching, Knowledge Conflict, and Information Irrelevant. By varying both user instructions and the nature of retrieved information, our approach captures the complexities of real-world applications where models must adapt to diverse user requirements. Through extensive experiments on multiple QA datasets, including HotpotQA, DisentQA, and our newly constructed synthetic URAQ dataset, we find that restricting memory usage improves robustness in adversarial retrieval conditions but decreases peak performance with ideal retrieval results and model family dominates behavioral differences. Our findings highlight the necessity of user-centric evaluations in the development of retrieval-augmented systems and provide insights into optimizing model performance across varied retrieval contexts. We will release our code and URAQ dataset upon acceptance of the paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19779v1-abstract-full').style.display = 'none'; document.getElementById('2502.19779v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18482">arXiv:2502.18482</a> <span> [<a href="https://arxiv.org/pdf/2502.18482">pdf</a>, <a href="https://arxiv.org/format/2502.18482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> MixLLM: Dynamic Routing in Mixed Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanchi Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xujiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengzhang Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenchao Yu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanjie Fu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18482v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) exhibit potential artificial generic intelligence recently, however, their usage is costly with high response latency. Given mixed LLMs with their own strengths and weaknesses, LLM routing aims to identify the most suitable model for each query in the stream to maximize response quality and minimize cost and latency. However, the challenges involve: (1) dynamic trade-o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18482v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18482v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18482v1-abstract-full" style="display: none;"> Large Language Models (LLMs) exhibit potential artificial generic intelligence recently, however, their usage is costly with high response latency. Given mixed LLMs with their own strengths and weaknesses, LLM routing aims to identify the most suitable model for each query in the stream to maximize response quality and minimize cost and latency. However, the challenges involve: (1) dynamic trade-offs among quality, cost, and latency; (2) enabling continual learning in deployed systems; and (3) navigating a varying (e.g., new LLM addition or old LLM removal) set of LLM candidates over time. To bridge these gaps, we develop MixLLM, a dynamic contextual-bandit-based routing system for query-LLM assignment. Specifically, we first leverage query tags to enhance query embeddings for the routing task. Next, we design lightweight prediction models to estimate the response qualities and costs of queries over LLMs. We then devise a meta-decision maker to choose the query-LLM assignments to best tradeoff response quality, cost, and latency. Finally, the system benefits from continual training, allowing it to adapt to evolving queries and user feedback over time. Our extensive experiments show that MixLLM achieves the best trade-offs in response quality, cost, and latency (97.25% of GPT-4's quality at 24.18% of the cost under the time constraint). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18482v1-abstract-full').style.display = 'none'; document.getElementById('2502.18482v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures, accepted by NAACL 2025 main conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> N/A </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16161">arXiv:2502.16161</a> <span> [<a href="https://arxiv.org/pdf/2502.16161">pdf</a>, <a href="https://arxiv.org/format/2502.16161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OmniParser V2: Structured-Points-of-Thought for Unified Visual Text Parsing and Its Generality to Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenwen Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhibo Yang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+J">Jianqiang Wan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Sibo Song</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jun Tang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wenqing Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuliang Liu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xiang Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16161v1-abstract-short" style="display: inline;"> Visually-situated text parsing (VsTP) has recently seen notable advancements, driven by the growing demand for automated document understanding and the emergence of large language models capable of processing document-based questions. While various methods have been proposed to tackle the complexities of VsTP, existing solutions often rely on task-specific architectures and objectives for individu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16161v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16161v1-abstract-full" style="display: none;"> Visually-situated text parsing (VsTP) has recently seen notable advancements, driven by the growing demand for automated document understanding and the emergence of large language models capable of processing document-based questions. While various methods have been proposed to tackle the complexities of VsTP, existing solutions often rely on task-specific architectures and objectives for individual tasks. This leads to modal isolation and complex workflows due to the diversified targets and heterogeneous schemas. In this paper, we introduce OmniParser V2, a universal model that unifies VsTP typical tasks, including text spotting, key information extraction, table recognition, and layout analysis, into a unified framework. Central to our approach is the proposed Structured-Points-of-Thought (SPOT) prompting schemas, which improves model performance across diverse scenarios by leveraging a unified encoder-decoder architecture, objective, and input\&output representation. SPOT eliminates the need for task-specific architectures and loss functions, significantly simplifying the processing pipeline. Our extensive evaluations across four tasks on eight different datasets show that OmniParser V2 achieves state-of-the-art or competitive results in VsTP. Additionally, we explore the integration of SPOT within a multimodal large language model structure, further enhancing text localization and recognition capabilities, thereby confirming the generality of SPOT prompting technique. The code is available at \href{https://github.com/AlibabaResearch/AdvancedLiterateMachinery}{AdvancedLiterateMachinery}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16161v1-abstract-full').style.display = 'none'; document.getElementById('2502.16161v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14133">arXiv:2502.14133</a> <span> [<a href="https://arxiv.org/pdf/2502.14133">pdf</a>, <a href="https://arxiv.org/format/2502.14133">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-Regularization with Latent Space Explanations for Controllable LLM-based Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xuansheng Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+X">Xiaoming Zhai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ninghao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14133v1-abstract-short" style="display: inline;"> Modern text classification methods heavily rely on contextual embeddings from large language models (LLMs). Compared to human-engineered features, these embeddings provide automatic and effective representations for classification model training. However, they also introduce a challenge: we lose the ability to manually remove unintended features, such as sensitive or task-irrelevant features, to g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14133v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14133v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14133v1-abstract-full" style="display: none;"> Modern text classification methods heavily rely on contextual embeddings from large language models (LLMs). Compared to human-engineered features, these embeddings provide automatic and effective representations for classification model training. However, they also introduce a challenge: we lose the ability to manually remove unintended features, such as sensitive or task-irrelevant features, to guarantee regulatory compliance or improve the generalizability of classification models. This limitation arises because LLM embeddings are opaque and difficult to interpret. In this paper, we propose a novel framework to identify and regularize unintended features in the LLM latent space. Specifically, we first pre-train a sparse autoencoder (SAE) to extract interpretable features from LLM latent spaces. To ensure the SAE can capture task-specific features, we further fine-tune it on task-specific datasets. In training the classification model, we propose a simple and effective regularizer, by minimizing the similarity between the classifier weights and the identified unintended feature, to remove the impacts of these unintended features toward classification. We evaluate the proposed framework on three real-world tasks, including toxic chat detection, reward modeling, and disease diagnosis. Results show that the proposed framework can significantly improve the classifier's generalizability by regularizing those features that are not semantically correlated to each task. This work pioneers controllable text classification on LLM latent spaces by leveraging interpreted features to address generalizability, fairness, and privacy challenges. We will release our code and data once accepted. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14133v1-abstract-full').style.display = 'none'; document.getElementById('2502.14133v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pre-print, 15 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13732">arXiv:2502.13732</a> <span> [<a href="https://arxiv.org/pdf/2502.13732">pdf</a>, <a href="https://arxiv.org/format/2502.13732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Homophily Heterogeneity Matters in Graph Federated Learning: A Spectrum Sharing and Complementing Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wentao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13732v1-abstract-short" style="display: inline;"> Since heterogeneity presents a fundamental challenge in graph federated learning, many existing methods are proposed to deal with node feature heterogeneity and structure heterogeneity. However, they overlook the critical homophily heterogeneity, which refers to the substantial variation in homophily levels across graph data from different clients. The homophily level represents the proportion of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13732v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13732v1-abstract-full" style="display: none;"> Since heterogeneity presents a fundamental challenge in graph federated learning, many existing methods are proposed to deal with node feature heterogeneity and structure heterogeneity. However, they overlook the critical homophily heterogeneity, which refers to the substantial variation in homophily levels across graph data from different clients. The homophily level represents the proportion of edges connecting nodes that belong to the same class. Due to adapting to their local homophily, local models capture inconsistent spectral properties across different clients, significantly reducing the effectiveness of collaboration. Specifically, local models trained on graphs with high homophily tend to capture low-frequency information, whereas local models trained on graphs with low homophily tend to capture high-frequency information. To effectively deal with homophily heterophily, we introduce the spectral Graph Neural Network (GNN) and propose a novel Federated learning method by mining Graph Spectral Properties (FedGSP). On one hand, our proposed FedGSP enables clients to share generic spectral properties (i.e., low-frequency information), allowing all clients to benefit through collaboration. On the other hand, inspired by our theoretical findings, our proposed FedGSP allows clients to complement non-generic spectral properties by acquiring the spectral properties they lack (i.e., high-frequency information), thereby obtaining additional information gain. Extensive experiments conducted on six homophilic and five heterophilic graph datasets, across both non-overlapping and overlapping settings, validate the superiority of our method over eleven state-of-the-art methods. Notably, our FedGSP outperforms the second-best method by an average margin of 3.28% on all heterophilic datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13732v1-abstract-full').style.display = 'none'; document.getElementById('2502.13732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13358">arXiv:2502.13358</a> <span> [<a href="https://arxiv.org/pdf/2502.13358">pdf</a>, <a href="https://arxiv.org/format/2502.13358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Editing Gap in LLMs: FineEdit for Precise and Targeted Text Modifications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yiming Zeng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wanhao Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zexin Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+T">Tao Ren</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yu Ma</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jinghan Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiyan Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tingting Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13358v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have transformed natural language processing, yet they still struggle with direct text editing tasks that demand precise, context-aware modifications. While models like ChatGPT excel in text generation and analysis, their editing abilities often fall short, addressing only superficial issues rather than deeper structural or logical inconsistencies. In this work, we int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13358v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13358v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have transformed natural language processing, yet they still struggle with direct text editing tasks that demand precise, context-aware modifications. While models like ChatGPT excel in text generation and analysis, their editing abilities often fall short, addressing only superficial issues rather than deeper structural or logical inconsistencies. In this work, we introduce a dual approach to enhance LLMs editing performance. First, we present InstrEditBench, a high-quality benchmark dataset comprising over 20,000 structured editing tasks spanning Wiki articles, LaTeX documents, code, and database Domain-specific Languages (DSL). InstrEditBench is generated using an innovative automated workflow that accurately identifies and evaluates targeted edits, ensuring that modifications adhere strictly to specified instructions without altering unrelated content. Second, we propose FineEdit, a specialized model trained on this curated benchmark. Experimental results demonstrate that FineEdit achieves significant improvements around {10\%} compared with Gemini on direct editing tasks, convincingly validating its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13358v1-abstract-full').style.display = 'none'; document.getElementById('2502.13358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12897">arXiv:2502.12897</a> <span> [<a href="https://arxiv.org/pdf/2502.12897">pdf</a>, <a href="https://arxiv.org/ps/2502.12897">ps</a>, <a href="https://arxiv.org/format/2502.12897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> </div> </div> <p class="title is-5 mathjax"> On Zero Skip-Cost Generalized Fractional-Repetition Codes from Covering Designs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenjun Yu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+B">Bo-Jun Yuan</a>, <a href="/search/cs?searchtype=author&query=Schwartz%2C+M">Moshe Schwartz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12897v1-abstract-short" style="display: inline;"> We study generalized fractional repetition codes that have zero skip cost, and which are based on covering designs. We show that a zero skip cost is always attainable, perhaps at a price of an expansion factor compared with the optimal size of fractional repetition codes based on Steiner systems. We provide three constructions, as well as show non-constructively, that no expansion is needed for al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12897v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12897v1-abstract-full" style="display: none;"> We study generalized fractional repetition codes that have zero skip cost, and which are based on covering designs. We show that a zero skip cost is always attainable, perhaps at a price of an expansion factor compared with the optimal size of fractional repetition codes based on Steiner systems. We provide three constructions, as well as show non-constructively, that no expansion is needed for all codes based on sufficiently large covering systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12897v1-abstract-full').style.display = 'none'; document.getElementById('2502.12897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11965">arXiv:2502.11965</a> <span> [<a href="https://arxiv.org/pdf/2502.11965">pdf</a>, <a href="https://arxiv.org/format/2502.11965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A MIMO Wireless Channel Foundation Model via CIR-CSI Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jun Jiang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenjun Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunfan Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11965v2-abstract-short" style="display: inline;"> In the field of artificial intelligence, self-supervised learning has demonstrated superior generalization capabilities by leveraging large-scale unlabeled datasets for pretraining, which is especially critical for wireless communication models to adapt to a variety of scenarios. This paper innovatively treats Channel State Information (CSI) and Channel Impulse Response (CIR) as naturally aligned… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11965v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11965v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11965v2-abstract-full" style="display: none;"> In the field of artificial intelligence, self-supervised learning has demonstrated superior generalization capabilities by leveraging large-scale unlabeled datasets for pretraining, which is especially critical for wireless communication models to adapt to a variety of scenarios. This paper innovatively treats Channel State Information (CSI) and Channel Impulse Response (CIR) as naturally aligned multi-modal data and proposes the first MIMO wireless channel foundation model, named CSI-CLIP. By effectively capturing the joint representations of both CIR and CSI, CSI-CLIP exhibits remarkable adaptability across scenarios and robust feature extraction capabilities. Experimental results show that in positioning task, CSI-CLIP reduces the mean error distance by 22%; in beam management task, it increases accuracy by 1% compared to traditional supervised methods, as well as in the channel identification task. These improvements not only highlight the potential and value of CSI-CLIP in integrating sensing and communication but also demonstrate its significant advantages over existing techniques. Moreover, viewing CSI and CIR as multi-modal pairs and contrastive learning for wireless channel foundation model open up new research directions in the domain of MIMO wireless communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11965v2-abstract-full').style.display = 'none'; document.getElementById('2502.11965v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2025 ICMLCN accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11418">arXiv:2502.11418</a> <span> [<a href="https://arxiv.org/pdf/2502.11418">pdf</a>, <a href="https://arxiv.org/format/2502.11418">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TimeCAP: Learning to Contextualize, Augment, and Predict Time Series Events with Large Language Model Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Geon Lee</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenchao Yu</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+K">Kijung Shin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wei Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11418v2-abstract-short" style="display: inline;"> Time series data is essential in various applications, including climate modeling, healthcare monitoring, and financial analytics. Understanding the contextual information associated with real-world time series data is often essential for accurate and reliable event predictions. In this paper, we introduce TimeCAP, a time-series processing framework that creatively employs Large Language Models (L… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11418v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11418v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11418v2-abstract-full" style="display: none;"> Time series data is essential in various applications, including climate modeling, healthcare monitoring, and financial analytics. Understanding the contextual information associated with real-world time series data is often essential for accurate and reliable event predictions. In this paper, we introduce TimeCAP, a time-series processing framework that creatively employs Large Language Models (LLMs) as contextualizers of time series data, extending their typical usage as predictors. TimeCAP incorporates two independent LLM agents: one generates a textual summary capturing the context of the time series, while the other uses this enriched summary to make more informed predictions. In addition, TimeCAP employs a multi-modal encoder that synergizes with the LLM agents, enhancing predictive performance through mutual augmentation of inputs with in-context examples. Experimental results on real-world datasets demonstrate that TimeCAP outperforms state-of-the-art methods for time series event prediction, including those utilizing LLMs as predictors, achieving an average improvement of 28.75% in F1 score. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11418v2-abstract-full').style.display = 'none'; document.getElementById('2502.11418v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10628">arXiv:2502.10628</a> <span> [<a href="https://arxiv.org/pdf/2502.10628">pdf</a>, <a href="https://arxiv.org/format/2502.10628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> On Self-Adaptive Perception Loss Function for Sequential Lossy Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Salehkalaibar%2C+S">Sadaf Salehkalaibar</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+B">Buu Phan</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+L">Likun Cai</a>, <a href="/search/cs?searchtype=author&query=Dick%2C+J+A">Joao Atz Dick</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wei Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun Chen</a>, <a href="/search/cs?searchtype=author&query=Khisti%2C+A">Ashish Khisti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10628v1-abstract-short" style="display: inline;"> We consider causal, low-latency, sequential lossy compression, with mean squared-error (MSE) as the distortion loss, and a perception loss function (PLF) to enhance the realism of reconstructions. As the main contribution, we propose and analyze a new PLF that considers the joint distribution between the current source frame and the previous reconstructions. We establish the theoretical rate-disto… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10628v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10628v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10628v1-abstract-full" style="display: none;"> We consider causal, low-latency, sequential lossy compression, with mean squared-error (MSE) as the distortion loss, and a perception loss function (PLF) to enhance the realism of reconstructions. As the main contribution, we propose and analyze a new PLF that considers the joint distribution between the current source frame and the previous reconstructions. We establish the theoretical rate-distortion-perception function for first-order Markov sources and analyze the Gaussian model in detail. From a qualitative perspective, the proposed metric can simultaneously avoid the error-permanence phenomenon and also better exploit the temporal correlation between high-quality reconstructions. The proposed metric is referred to as self-adaptive perception loss function (PLF-SA), as its behavior adapts to the quality of reconstructed frames. We provide a detailed comparison of the proposed perception loss function with previous approaches through both information theoretic analysis as well as experiments involving moving MNIST and UVG datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10628v1-abstract-full').style.display = 'none'; document.getElementById('2502.10628v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2305.19301</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09994">arXiv:2502.09994</a> <span> [<a href="https://arxiv.org/pdf/2502.09994">pdf</a>, <a href="https://arxiv.org/format/2502.09994">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Decision Information Meets Large Language Models: The Future of Explainable Operations Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yansen Zhang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Q">Qingcan Kang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W+Y">Wing Yin Yu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+H">Hailei Gong</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xiaojin Fu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiongwei Han</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+T">Tao Zhong</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chen Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09994v1-abstract-short" style="display: inline;"> Operations Research (OR) is vital for decision-making in many industries. While recent OR methods have seen significant improvements in automation and efficiency through integrating Large Language Models (LLMs), they still struggle to produce meaningful explanations. This lack of clarity raises concerns about transparency and trustworthiness in OR applications. To address these challenges, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09994v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09994v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09994v1-abstract-full" style="display: none;"> Operations Research (OR) is vital for decision-making in many industries. While recent OR methods have seen significant improvements in automation and efficiency through integrating Large Language Models (LLMs), they still struggle to produce meaningful explanations. This lack of clarity raises concerns about transparency and trustworthiness in OR applications. To address these challenges, we propose a comprehensive framework, Explainable Operations Research (EOR), emphasizing actionable and understandable explanations accompanying optimization. The core of EOR is the concept of Decision Information, which emerges from what-if analysis and focuses on evaluating the impact of complex constraints (or parameters) changes on decision-making. Specifically, we utilize bipartite graphs to quantify the changes in the OR model and adopt LLMs to improve the explanation capabilities. Additionally, we introduce the first industrial benchmark to rigorously evaluate the effectiveness of explanations and analyses in OR, establishing a new standard for transparency and clarity in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09994v1-abstract-full').style.display = 'none'; document.getElementById('2502.09994v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06676">arXiv:2502.06676</a> <span> [<a href="https://arxiv.org/pdf/2502.06676">pdf</a>, <a href="https://arxiv.org/format/2502.06676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Discovery of skill switching criteria for learning agile quadruped locomotion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wanming Yu</a>, <a href="/search/cs?searchtype=author&query=Acero%2C+F">Fernando Acero</a>, <a href="/search/cs?searchtype=author&query=Atanassov%2C+V">Vassil Atanassov</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Havoutis%2C+I">Ioannis Havoutis</a>, <a href="/search/cs?searchtype=author&query=Kanoulas%2C+D">Dimitrios Kanoulas</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhibin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06676v1-abstract-short" style="display: inline;"> This paper develops a hierarchical learning and optimization framework that can learn and achieve well-coordinated multi-skill locomotion. The learned multi-skill policy can switch between skills automatically and naturally in tracking arbitrarily positioned goals and recover from failures promptly. The proposed framework is composed of a deep reinforcement learning process and an optimization pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06676v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06676v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06676v1-abstract-full" style="display: none;"> This paper develops a hierarchical learning and optimization framework that can learn and achieve well-coordinated multi-skill locomotion. The learned multi-skill policy can switch between skills automatically and naturally in tracking arbitrarily positioned goals and recover from failures promptly. The proposed framework is composed of a deep reinforcement learning process and an optimization process. First, the contact pattern is incorporated into the reward terms for learning different types of gaits as separate policies without the need for any other references. Then, a higher level policy is learned to generate weights for individual policies to compose multi-skill locomotion in a goal-tracking task setting. Skills are automatically and naturally switched according to the distance to the goal. The proper distances for skill switching are incorporated in reward calculation for learning the high level policy and updated by an outer optimization loop as learning progresses. We first demonstrated successful multi-skill locomotion in comprehensive tasks on a simulated Unitree A1 quadruped robot. We also deployed the learned policy in the real world showcasing trotting, bounding, galloping, and their natural transitions as the goal position changes. Moreover, the learned policy can react to unexpected failures at any time, perform prompt recovery, and resume locomotion successfully. Compared to discrete switch between single skills which failed to transition to galloping in the real world, our proposed approach achieves all the learned agile skills, with smoother and more continuous skill transitions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06676v1-abstract-full').style.display = 'none'; document.getElementById('2502.06676v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05222">arXiv:2502.05222</a> <span> [<a href="https://arxiv.org/pdf/2502.05222">pdf</a>, <a href="https://arxiv.org/format/2502.05222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> VistaFlow: Photorealistic Volumetric Reconstruction with Dynamic Resolution Management via Q-Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Palamadai%2C+J">Jayram Palamadai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">William Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05222v1-abstract-short" style="display: inline;"> We introduce VistaFlow, a scalable three-dimensional imaging technique capable of reconstructing fully interactive 3D volumetric images from a set of 2D photographs. Our model synthesizes novel viewpoints through a differentiable rendering system capable of dynamic resolution management on photorealistic 3D scenes. We achieve this through the introduction of QuiQ, a novel intermediate video contro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05222v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05222v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05222v1-abstract-full" style="display: none;"> We introduce VistaFlow, a scalable three-dimensional imaging technique capable of reconstructing fully interactive 3D volumetric images from a set of 2D photographs. Our model synthesizes novel viewpoints through a differentiable rendering system capable of dynamic resolution management on photorealistic 3D scenes. We achieve this through the introduction of QuiQ, a novel intermediate video controller trained through Q-learning to maintain a consistently high framerate by adjusting render resolution with millisecond precision. Notably, VistaFlow runs natively on integrated CPU graphics, making it viable for mobile and entry-level devices while still delivering high-performance rendering. VistaFlow bypasses Neural Radiance Fields (NeRFs), using the PlenOctree data structure to render complex light interactions such as reflection and subsurface scattering with minimal hardware requirements. Our model is capable of outperforming state-of-the-art methods with novel view synthesis at a resolution of 1080p at over 100 frames per second on consumer hardware. By tailoring render quality to the capabilities of each device, VistaFlow has the potential to improve the efficiency and accessibility of photorealistic 3D scene rendering across a wide spectrum of hardware, from high-end workstations to inexpensive microcontrollers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05222v1-abstract-full').style.display = 'none'; document.getElementById('2502.05222v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03884">arXiv:2502.03884</a> <span> [<a href="https://arxiv.org/pdf/2502.03884">pdf</a>, <a href="https://arxiv.org/format/2502.03884">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rank Also Matters: Hierarchical Configuration for Mixture of Adapter Experts in LLM Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cong%2C+P">Peizhuang Cong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenpu Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhan Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haochen Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03884v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable success across various tasks, accompanied by a continuous increase in their parameter size. Parameter-efficient fine-tuning (PEFT) methods, such as Low-Rank Adaptation (LoRA), address the challenges of fine-tuning LLMs by significantly reducing the number of trainable parameters. Recent studies have integrated LoRA with Mixture of Experts (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03884v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03884v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03884v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable success across various tasks, accompanied by a continuous increase in their parameter size. Parameter-efficient fine-tuning (PEFT) methods, such as Low-Rank Adaptation (LoRA), address the challenges of fine-tuning LLMs by significantly reducing the number of trainable parameters. Recent studies have integrated LoRA with Mixture of Experts (MoE) architectures, leveraging multiple adapter experts and gating mechanisms to further improve fine-tuning performance. However, existing approaches primarily focus on adjusting the allocations of adapter experts per layer to optimize the introduced trainable parameter size, while neglecting a critical factor of adapters' rank. To this end, we propose a hierarchical scheme for expert allocation and rank configuration, HILO, which dynamically adjusts the number and rank of adapter experts across layers, matching the varying representational complexity of model layers in adapter-granularity. Extensive experiments on multiple benchmark tasks demonstrate that HILO outperforms existing methods in accuracy while introducing fewer trainable parameters, providing an efficient and practical solution for fine-tuning LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03884v1-abstract-full').style.display = 'none'; document.getElementById('2502.03884v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16759">arXiv:2501.16759</a> <span> [<a href="https://arxiv.org/pdf/2501.16759">pdf</a>, <a href="https://arxiv.org/format/2501.16759">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Are Joins over LSM-trees Ready: Take RocksDB as an Example </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weiping Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Siqiang Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16759v2-abstract-short" style="display: inline;"> LSM-tree-based data stores are widely adopted in industries for their excellent performance. As data scales increase, disk-based join operations become indispensable yet costly for the database, making the selection of suitable join methods crucial for system optimization. Current LSM-based stores generally adhere to conventional relational database practices and support only a limited number of j… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16759v2-abstract-full').style.display = 'inline'; document.getElementById('2501.16759v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16759v2-abstract-full" style="display: none;"> LSM-tree-based data stores are widely adopted in industries for their excellent performance. As data scales increase, disk-based join operations become indispensable yet costly for the database, making the selection of suitable join methods crucial for system optimization. Current LSM-based stores generally adhere to conventional relational database practices and support only a limited number of join methods. However, the LSM-tree delivers distinct read and write efficiency compared to the relational databases, which could accordingly impact the performance of various join methods. Therefore, it is necessary to reconsider the selection of join methods in this context to fully explore the potential of various join algorithms and index designs. In this work, we present a systematic study and an exhaustive benchmark for joins over LSM-trees. We define a configuration space for join methods, encompassing various join algorithms, secondary index types, and consistency strategies. We also summarize a theoretical analysis to evaluate the overhead of each join method for an in-depth understanding. Furthermore, we implement all join methods in the configuration space on a unified platform and compare their performance through extensive experiments. Our theoretical and experimental results yield several insights and takeaways tailored to joins in LSM-based stores that aid developers in choosing proper join methods based on their working conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16759v2-abstract-full').style.display = 'none'; document.getElementById('2501.16759v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by VLDB 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16679">arXiv:2501.16679</a> <span> [<a href="https://arxiv.org/pdf/2501.16679">pdf</a>, <a href="https://arxiv.org/format/2501.16679">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Polyp-Gen: Realistic and Diverse Polyp Image Generation for Endoscopic Dataset Expansion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shengyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhen Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qiushi Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weihao Yu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+D">Di Dong</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jiancong Hu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16679v2-abstract-short" style="display: inline;"> Automated diagnostic systems (ADS) have shown significant potential in the early detection of polyps during endoscopic examinations, thereby reducing the incidence of colorectal cancer. However, due to high annotation costs and strict privacy concerns, acquiring high-quality endoscopic images poses a considerable challenge in the development of ADS. Despite recent advancements in generating synthe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16679v2-abstract-full').style.display = 'inline'; document.getElementById('2501.16679v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16679v2-abstract-full" style="display: none;"> Automated diagnostic systems (ADS) have shown significant potential in the early detection of polyps during endoscopic examinations, thereby reducing the incidence of colorectal cancer. However, due to high annotation costs and strict privacy concerns, acquiring high-quality endoscopic images poses a considerable challenge in the development of ADS. Despite recent advancements in generating synthetic images for dataset expansion, existing endoscopic image generation algorithms failed to accurately generate the details of polyp boundary regions and typically required medical priors to specify plausible locations and shapes of polyps, which limited the realism and diversity of the generated images. To address these limitations, we present Polyp-Gen, the first full-automatic diffusion-based endoscopic image generation framework. Specifically, we devise a spatial-aware diffusion training scheme with a lesion-guided loss to enhance the structural context of polyp boundary regions. Moreover, to capture medical priors for the localization of potential polyp areas, we introduce a hierarchical retrieval-based sampling strategy to match similar fine-grained spatial features. In this way, our Polyp-Gen can generate realistic and diverse endoscopic images for building reliable ADS. Extensive experiments demonstrate the state-of-the-art generation quality, and the synthetic images can improve the downstream polyp detection task. Additionally, our Polyp-Gen has shown remarkable zero-shot generalizability on other datasets. The source code is available at https://github.com/CUHK-AIM-Group/Polyp-Gen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16679v2-abstract-full').style.display = 'none'; document.getElementById('2501.16679v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16103">arXiv:2501.16103</a> <span> [<a href="https://arxiv.org/pdf/2501.16103">pdf</a>, <a href="https://arxiv.org/ps/2501.16103">ps</a>, <a href="https://arxiv.org/format/2501.16103">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Static Batching of Irregular Workloads on GPUs: Framework and Application to Efficient MoE Model Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinghan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yifei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiejing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bujiao Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaotong Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+L">Lian Duan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yejun Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuanyu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wente Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yajie Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiacheng Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peiyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Laiwen Zheng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenyuan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16103v1-abstract-short" style="display: inline;"> It has long been a problem to arrange and execute irregular workloads on massively parallel devices. We propose a general framework for statically batching irregular workloads into a single kernel with a runtime task mapping mechanism on GPUs. We further apply this framework to Mixture-of-Experts (MoE) model inference and implement an optimized and efficient CUDA kernel. Our MoE kernel achieves up… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16103v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16103v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16103v1-abstract-full" style="display: none;"> It has long been a problem to arrange and execute irregular workloads on massively parallel devices. We propose a general framework for statically batching irregular workloads into a single kernel with a runtime task mapping mechanism on GPUs. We further apply this framework to Mixture-of-Experts (MoE) model inference and implement an optimized and efficient CUDA kernel. Our MoE kernel achieves up to 91% of the peak Tensor Core throughput on NVIDIA H800 GPU and 95% on NVIDIA H20 GPU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16103v1-abstract-full').style.display = 'none'; document.getElementById('2501.16103v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.1.3; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15427">arXiv:2501.15427</a> <span> [<a href="https://arxiv.org/pdf/2501.15427">pdf</a>, <a href="https://arxiv.org/format/2501.15427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OpenCharacter: Training Customizable Role-Playing LLMs with Large-Scale Synthetic Personas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaoyang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongming Zhang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+T">Tao Ge</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dian Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15427v2-abstract-short" style="display: inline;"> Customizable role-playing in large language models (LLMs), also known as character generalization, is gaining increasing attention for its versatility and cost-efficiency in developing and deploying role-playing dialogue agents. This study explores a large-scale data synthesis approach to equip LLMs with character generalization capabilities. We begin by synthesizing large-scale character profiles… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15427v2-abstract-full').style.display = 'inline'; document.getElementById('2501.15427v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15427v2-abstract-full" style="display: none;"> Customizable role-playing in large language models (LLMs), also known as character generalization, is gaining increasing attention for its versatility and cost-efficiency in developing and deploying role-playing dialogue agents. This study explores a large-scale data synthesis approach to equip LLMs with character generalization capabilities. We begin by synthesizing large-scale character profiles using personas from Persona Hub and then explore two strategies: response rewriting and response generation, to create character-aligned instructional responses. To validate the effectiveness of our synthetic instruction tuning data for character generalization, we perform supervised fine-tuning (SFT) using the LLaMA-3 8B model. Our best-performing model strengthens the original LLaMA-3 8B Instruct model and achieves performance comparable to GPT-4o models on role-playing dialogue. We release our synthetic characters and instruction-tuning dialogues to support public research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15427v2-abstract-full').style.display = 'none'; document.getElementById('2501.15427v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15383">arXiv:2501.15383</a> <span> [<a href="https://arxiv.org/pdf/2501.15383">pdf</a>, <a href="https://arxiv.org/format/2501.15383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Qwen2.5-1M Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+A">An Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengyuan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyan Huang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiandong Jiang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+J">Jianhong Tu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+K">Kai Dang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kexin Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Le Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mei Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Minmin Sun</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qin Zhu</a>, <a href="/search/cs?searchtype=author&query=Men%2C+R">Rui Men</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tao He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weijia Xu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wenbiao Yin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenyuan Yu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xiafei Qiu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xingzhang Ren</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinlong Yang</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15383v1-abstract-short" style="display: inline;"> We introduce Qwen2.5-1M, a series of models that extend the context length to 1 million tokens. Compared to the previous 128K version, the Qwen2.5-1M series have significantly enhanced long-context capabilities through long-context pre-training and post-training. Key techniques such as long data synthesis, progressive pre-training, and multi-stage supervised fine-tuning are employed to effectively… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15383v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15383v1-abstract-full" style="display: none;"> We introduce Qwen2.5-1M, a series of models that extend the context length to 1 million tokens. Compared to the previous 128K version, the Qwen2.5-1M series have significantly enhanced long-context capabilities through long-context pre-training and post-training. Key techniques such as long data synthesis, progressive pre-training, and multi-stage supervised fine-tuning are employed to effectively enhance long-context performance while reducing training costs. To promote the use of long-context models among a broader user base, we present and open-source our inference framework. This framework includes a length extrapolation method that can expand the model context lengths by at least four times, or even more, without additional training. To reduce inference costs, we implement a sparse attention method along with chunked prefill optimization for deployment scenarios and a sparsity refinement method to improve precision. Additionally, we detail our optimizations in the inference engine, including kernel optimization, pipeline parallelism, and scheduling optimization, which significantly enhance overall inference performance. By leveraging our inference framework, the Qwen2.5-1M models achieve a remarkable 3x to 7x prefill speedup in scenarios with 1 million tokens of context. This framework provides an efficient and powerful solution for developing applications that require long-context processing using open-source models. The Qwen2.5-1M series currently includes the open-source models Qwen2.5-7B-Instruct-1M and Qwen2.5-14B-Instruct-1M, as well as the API-accessed model Qwen2.5-Turbo. Evaluations show that Qwen2.5-1M models have been greatly improved in long-context tasks without compromising performance in short-context scenarios. Specifically, the Qwen2.5-14B-Instruct-1M model significantly outperforms GPT-4o-mini in long-context tasks and supports contexts eight times longer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15383v1-abstract-full').style.display = 'none'; document.getElementById('2501.15383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13599">arXiv:2501.13599</a> <span> [<a href="https://arxiv.org/pdf/2501.13599">pdf</a>, <a href="https://arxiv.org/format/2501.13599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning under Commission and Omission Event Outliers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuecheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+G">Guanhua Fang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wen Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13599v1-abstract-short" style="display: inline;"> Event stream is an important data format in real life. The events are usually expected to follow some regular patterns over time. However, the patterns could be contaminated by unexpected absences or occurrences of events. In this paper, we adopt the temporal point process framework for learning event stream and we provide a simple-but-effective method to deal with both commission and omission eve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13599v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13599v1-abstract-full" style="display: none;"> Event stream is an important data format in real life. The events are usually expected to follow some regular patterns over time. However, the patterns could be contaminated by unexpected absences or occurrences of events. In this paper, we adopt the temporal point process framework for learning event stream and we provide a simple-but-effective method to deal with both commission and omission event outliers.In particular, we introduce a novel weight function to dynamically adjust the importance of each observed event so that the final estimator could offer multiple statistical merits. We compare the proposed method with the vanilla one in the classification problems, where event streams can be clustered into different groups. Both theoretical and numerical results confirm the effectiveness of our new approach. To our knowledge, our method is the first one to provably handle both commission and omission outliers simultaneously. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13599v1-abstract-full').style.display = 'none'; document.getElementById('2501.13599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">38 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13426">arXiv:2501.13426</a> <span> [<a href="https://arxiv.org/pdf/2501.13426">pdf</a>, <a href="https://arxiv.org/format/2501.13426">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Auto-Prompting SAM for Weakly Supervised Landslide Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xianping Ma</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weikang Yu</a>, <a href="/search/cs?searchtype=author&query=Ghamisi%2C+P">Pedram Ghamisi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13426v1-abstract-short" style="display: inline;"> Weakly supervised landslide extraction aims to identify landslide regions from remote sensing data using models trained with weak labels, particularly image-level labels. However, it is often challenged by the imprecise boundaries of the extracted objects due to the lack of pixel-wise supervision and the properties of landslide objects. To tackle these issues, we propose a simple yet effective met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13426v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13426v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13426v1-abstract-full" style="display: none;"> Weakly supervised landslide extraction aims to identify landslide regions from remote sensing data using models trained with weak labels, particularly image-level labels. However, it is often challenged by the imprecise boundaries of the extracted objects due to the lack of pixel-wise supervision and the properties of landslide objects. To tackle these issues, we propose a simple yet effective method by auto-prompting the Segment Anything Model (SAM), i.e., APSAM. Instead of depending on high-quality class activation maps (CAMs) for pseudo-labeling or fine-tuning SAM, our method directly yields fine-grained segmentation masks from SAM inference through prompt engineering. Specifically, it adaptively generates hybrid prompts from the CAMs obtained by an object localization network. To provide sufficient information for SAM prompting, an adaptive prompt generation (APG) algorithm is designed to fully leverage the visual patterns of CAMs, enabling the efficient generation of pseudo-masks for landslide extraction. These informative prompts are able to identify the extent of landslide areas (box prompts) and denote the centers of landslide objects (point prompts), guiding SAM in landslide segmentation. Experimental results on high-resolution aerial and satellite datasets demonstrate the effectiveness of our method, achieving improvements of at least 3.0\% in F1 score and 3.69\% in IoU compared to other state-of-the-art methods. The source codes and datasets will be available at https://github.com/zxk688. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13426v1-abstract-full').style.display = 'none'; document.getElementById('2501.13426v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12948">arXiv:2501.12948</a> <span> [<a href="https://arxiv.org/pdf/2501.12948">pdf</a>, <a href="https://arxiv.org/format/2501.12948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xingkai Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z+F">Z. F. Wu</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ziyi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12948v1-abstract-short" style="display: inline;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12948v1-abstract-full" style="display: none;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'none'; document.getElementById('2501.12948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08114">arXiv:2501.08114</a> <span> [<a href="https://arxiv.org/pdf/2501.08114">pdf</a>, <a href="https://arxiv.org/format/2501.08114">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Change Captioning in Remote Sensing: Evolution to SAT-Cap -- A Single-Stage Transformer Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuduo Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weikang Yu</a>, <a href="/search/cs?searchtype=author&query=Ghamisi%2C+P">Pedram Ghamisi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08114v1-abstract-short" style="display: inline;"> Change captioning has become essential for accurately describing changes in multi-temporal remote sensing data, providing an intuitive way to monitor Earth's dynamics through natural language. However, existing change captioning methods face two key challenges: high computational demands due to multistage fusion strategy, and insufficient detail in object descriptions due to limited semantic extra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08114v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08114v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08114v1-abstract-full" style="display: none;"> Change captioning has become essential for accurately describing changes in multi-temporal remote sensing data, providing an intuitive way to monitor Earth's dynamics through natural language. However, existing change captioning methods face two key challenges: high computational demands due to multistage fusion strategy, and insufficient detail in object descriptions due to limited semantic extraction from individual images. To solve these challenges, we propose SAT-Cap based on the transformers model with a single-stage feature fusion for remote sensing change captioning. In particular, SAT-Cap integrates a Spatial-Channel Attention Encoder, a Difference-Guided Fusion module, and a Caption Decoder. Compared to typical models that require multi-stage fusion in transformer encoder and fusion module, SAT-Cap uses only a simple cosine similarity-based fusion module for information integration, reducing the complexity of the model architecture. By jointly modeling spatial and channel information in Spatial-Channel Attention Encoder, our approach significantly enhances the model's ability to extract semantic information from objects in multi-temporal remote sensing images. Extensive experiments validate the effectiveness of SAT-Cap, achieving CIDEr scores of 140.23% on the LEVIR-CC dataset and 97.74% on the DUBAI-CC dataset, surpassing current state-of-the-art methods. The code and pre-trained models will be available online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08114v1-abstract-full').style.display = 'none'; document.getElementById('2501.08114v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08001">arXiv:2501.08001</a> <span> [<a href="https://arxiv.org/pdf/2501.08001">pdf</a>, <a href="https://arxiv.org/format/2501.08001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GDiffRetro: Retrosynthesis Prediction with Dual Graph Enhanced Molecular Representation and Diffusion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shengyin Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yuxiang Ren</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Weitao Du</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liwei Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuecang Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Ying Hu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chen Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08001v1-abstract-short" style="display: inline;"> Retrosynthesis prediction focuses on identifying reactants capable of synthesizing a target product. Typically, the retrosynthesis prediction involves two phases: Reaction Center Identification and Reactant Generation. However, we argue that most existing methods suffer from two limitations in the two phases: (i) Existing models do not adequately capture the ``face'' information in molecular graph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08001v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08001v1-abstract-full" style="display: none;"> Retrosynthesis prediction focuses on identifying reactants capable of synthesizing a target product. Typically, the retrosynthesis prediction involves two phases: Reaction Center Identification and Reactant Generation. However, we argue that most existing methods suffer from two limitations in the two phases: (i) Existing models do not adequately capture the ``face'' information in molecular graphs for the reaction center identification. (ii) Current approaches for the reactant generation predominantly use sequence generation in a 2D space, which lacks versatility in generating reasonable distributions for completed reactive groups and overlooks molecules' inherent 3D properties. To overcome the above limitations, we propose GDiffRetro. For the reaction center identification, GDiffRetro uniquely integrates the original graph with its corresponding dual graph to represent molecular structures, which helps guide the model to focus more on the faces in the graph. For the reactant generation, GDiffRetro employs a conditional diffusion model in 3D to further transform the obtained synthon into a complete reactant. Our experimental findings reveal that GDiffRetro outperforms state-of-the-art semi-template models across various evaluative metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08001v1-abstract-full').style.display = 'none'; document.getElementById('2501.08001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07861">arXiv:2501.07861</a> <span> [<a href="https://arxiv.org/pdf/2501.07861">pdf</a>, <a href="https://arxiv.org/format/2501.07861">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ReARTeR: Retrieval-Augmented Reasoning with Trustworthy Process Rewarding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhongxiang Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weijie Yu</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+X">Xiaoxue Zang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kai Zheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Song Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Han Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07861v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) systems for Large Language Models (LLMs) hold promise in knowledge-intensive tasks but face limitations in complex multi-step reasoning. While recent methods have integrated RAG with chain-of-thought reasoning or test-time search using Process Reward Models (PRMs), these approaches encounter challenges such as a lack of explanations, bias in PRM training data,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07861v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07861v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07861v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) systems for Large Language Models (LLMs) hold promise in knowledge-intensive tasks but face limitations in complex multi-step reasoning. While recent methods have integrated RAG with chain-of-thought reasoning or test-time search using Process Reward Models (PRMs), these approaches encounter challenges such as a lack of explanations, bias in PRM training data, early-step bias in PRM scores, and insufficient post-training optimization of reasoning potential. To address these issues, we propose Retrieval-Augmented Reasoning through Trustworthy Process Rewarding (ReARTeR), a framework that enhances RAG systems' reasoning capabilities through post-training and test-time scaling. At test time, ReARTeR introduces Trustworthy Process Rewarding via a Process Reward Model for accurate scalar scoring and a Process Explanation Model (PEM) for generating natural language explanations, enabling step refinement. During post-training, it utilizes Monte Carlo Tree Search guided by Trustworthy Process Rewarding to collect high-quality step-level preference data, optimized through Iterative Preference Optimization. ReARTeR addresses three core challenges: (1) misalignment between PRM and PEM, tackled through off-policy preference learning; (2) bias in PRM training data, mitigated by balanced annotation methods and stronger annotations for challenging examples; and (3) early-step bias in PRM, resolved through a temporal-difference-based look-ahead search strategy. Experimental results on multi-step reasoning benchmarks demonstrate significant improvements, underscoring ReARTeR's potential to advance the reasoning capabilities of RAG systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07861v1-abstract-full').style.display = 'none'; document.getElementById('2501.07861v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yu%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yu%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>