Search | arXiv e-print repository
Showing 1–50 of 1,557 results for author: Li, D
Searching in archive cs. Sorted by announcement date (newest first); 50 results per page.

1. arXiv:2503.20174 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV)
   Title: Devil is in the Uniformity: Exploring Diverse Learners within Transformer for Image Restoration
   Authors: Shihao Zhou, Dayu Li, Jinshan Pan, Juncheng Zhou, Jinglei Shi, Jufeng Yang
   Abstract: Transformer-based approaches have gained significant attention in image restoration, where the core component, i.e., Multi-Head Attention (MHA), plays a crucial role in capturing diverse features and recovering high-quality results. In MHA, heads perform attention calculations independently over uniformly split subspaces, which triggers a redundancy issue that hinders the model from achieving satisfactory outputs. In this paper, we propose to improve MHA by exploring diverse learners and introducing various interactions between heads, resulting in a Hierarchical multI-head atteNtion driven Transformer model, termed HINT, for image restoration. HINT contains two modules, i.e., the Hierarchical Multi-Head Attention (HMHA) and the Query-Key Cache Updating (QKCU) module, to address the redundancy problem rooted in vanilla MHA. Specifically, HMHA extracts diverse contextual features by employing heads that learn from subspaces of varying sizes and containing different information. Moreover, QKCU, comprising intra- and inter-layer schemes, further reduces redundancy by facilitating enhanced interactions between attention heads within and across layers. Extensive experiments on 12 benchmarks across 5 image restoration tasks, including low-light enhancement, dehazing, desnowing, denoising, and deraining, demonstrate the superiority of HINT. The source code is available in the supplementary materials.
   Submitted 25 March, 2025; originally announced March 2025.
   Comments: 11 pages, 10 figures

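The HMHA idea of letting heads attend over subspaces of different sizes can be pictured with a minimal sketch. The NumPy snippet below illustrates only that general pattern, not the authors' HINT/HMHA code; the random projection matrices, head sizes, and single-layer setup are assumptions.

```python
# Illustrative sketch only: multi-head self-attention whose heads use subspaces
# of *different* sizes, loosely inspired by the HINT/HMHA abstract.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def varied_head_attention(x, head_dims, rng):
    """x: (seq_len, d_model); head_dims: per-head subspace sizes summing to d_model."""
    seq_len, d_model = x.shape
    assert sum(head_dims) == d_model
    outputs = []
    for d_h in head_dims:
        # Hypothetical random projections stand in for learned weights.
        Wq, Wk, Wv = (rng.standard_normal((d_model, d_h)) / np.sqrt(d_model) for _ in range(3))
        q, k, v = x @ Wq, x @ Wk, x @ Wv
        attn = softmax(q @ k.T / np.sqrt(d_h))   # (seq_len, seq_len)
        outputs.append(attn @ v)                 # (seq_len, d_h)
    return np.concatenate(outputs, axis=-1)      # (seq_len, d_model)

rng = np.random.default_rng(0)
tokens = rng.standard_normal((10, 64))
out = varied_head_attention(tokens, head_dims=[8, 16, 40], rng=rng)
print(out.shape)  # (10, 64)
```
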
2. arXiv:2503.19404 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV)
   Title: LangBridge: Interpreting Image as a Combination of Language Embeddings
   Authors: Jiaqi Liao, Yuwei Niu, Fanqing Meng, Hao Li, Changyao Tian, Yinuo Du, Yuwen Xiong, Dianqi Li, Xizhou Zhu, Li Yuan, Jifeng Dai, Yu Cheng
   Abstract: Recent years have witnessed remarkable advances in Large Vision-Language Models (LVLMs), which have achieved human-level performance across various complex vision-language tasks. Following LLaVA's paradigm, mainstream LVLMs typically employ a shallow MLP for visual-language alignment through a two-stage training process: pretraining for cross-modal alignment followed by instruction tuning. While this approach has proven effective, the underlying mechanisms of how MLPs bridge the modality gap remain poorly understood. Although some research has explored how LLMs process transformed visual tokens, few studies have investigated the fundamental alignment mechanism. Furthermore, the MLP adapter requires retraining whenever switching LLM backbones. To address these limitations, we first investigate the working principles of MLP adapters and discover that they learn to progressively project visual embeddings into subspaces spanned by corresponding text embeddings. Based on this insight, we propose LangBridge, a novel adapter that explicitly maps visual tokens to linear combinations of LLM vocabulary embeddings. This innovative design enables pretraining-free adapter transfer across different LLMs while maintaining performance. Our experimental results demonstrate that a LangBridge adapter pre-trained on Qwen2-0.5B can be directly applied to larger models such as LLaMA3-8B or Qwen2.5-14B while maintaining competitive performance. Overall, LangBridge enables interpretable vision-language alignment by grounding visual representations in LLM vocabulary embeddings, while its plug-and-play design ensures efficient reuse across multiple LLMs with nearly no performance degradation. See our project page at https://jiaqiliao77.github.io/LangBridge.github.io/
   Submitted 25 March, 2025; v1 submitted 25 March, 2025; originally announced March 2025.
   Comments: The code and weights will be open-sourced. Project page: https://jiaqiliao77.github.io/LangBridge.github.io/

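The core mechanism described here, representing a visual token as a softmax-weighted linear combination of the LLM's vocabulary embeddings, can be sketched in a few lines. This is a minimal illustration under assumed shapes and a randomly initialized adapter, not the released LangBridge code.

```python
# Illustrative sketch only: express a visual token as a (soft) linear combination
# of a language model's vocabulary embeddings, in the spirit of the LangBridge
# abstract. Names, shapes, and the random adapter are assumptions.
import numpy as np

rng = np.random.default_rng(0)
vocab_size, d_text, d_vision = 1000, 64, 32

E = rng.standard_normal((vocab_size, d_text))           # frozen LLM vocabulary embeddings
W = rng.standard_normal((d_vision, vocab_size)) * 0.02  # hypothetical learned adapter

def langbridge_like(visual_tokens):
    """Map visual features to mixtures of vocabulary embeddings."""
    logits = visual_tokens @ W                           # (n_tokens, vocab_size)
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)       # softmax over the vocabulary
    return weights @ E                                   # (n_tokens, d_text)

visual = rng.standard_normal((5, d_vision))
aligned = langbridge_like(visual)
print(aligned.shape)  # (5, 64): tokens now live in the text-embedding space
```
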
3. arXiv:2503.19386 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV); Signal Processing (eess.SP)
   Title: Exploring Textual Semantics Diversity for Image Transmission in Semantic Communication Systems using Visual Language Model
   Authors: Peishan Huang, Dong Li
   Abstract: In recent years, the rapid development of machine learning has brought reforms and challenges to traditional communication systems. Semantic communication has emerged as an effective strategy to extract relevant semantic signals, such as semantic segmentation labels and image features, for image transmission. However, an insufficient number of extracted semantic features will potentially result in low reconstruction accuracy, which hinders practical applications and remains challenging to solve. In order to fill this gap, this letter proposes a multi-text transmission semantic communication (Multi-SC) system, which uses the visual language model (VLM) to assist in the transmission of image semantic signals. Unlike previous image transmission semantic communication systems, the proposed system divides the image into multiple blocks, extracts multiple pieces of text information from the image using a modified large language and visual assistant (LLaVA), and combines semantic segmentation tags with semantic text for image recovery. Simulation results show that the proposed text semantics diversity scheme can significantly improve the reconstruction accuracy compared with related works.
   Submitted 25 March, 2025; originally announced March 2025.

4. arXiv:2503.19312 [pdf, other]
   Subjects: Computer Vision and Pattern Recognition (cs.CV)
   Title: ImageGen-CoT: Enhancing Text-to-Image In-context Learning with Chain-of-Thought Reasoning
   Authors: Jiaqi Liao, Zhengyuan Yang, Linjie Li, Dianqi Li, Kevin Lin, Yu Cheng, Lijuan Wang
   Abstract: In this work, we study the problem of Text-to-Image In-Context Learning (T2I-ICL). While Unified Multimodal LLMs (MLLMs) have advanced rapidly in recent years, they struggle with contextual reasoning in T2I-ICL scenarios. To address this limitation, we propose a novel framework that incorporates a thought process called ImageGen-CoT prior to image generation. To avoid generating unstructured, ineffective reasoning steps, we develop an automatic pipeline to curate a high-quality ImageGen-CoT dataset. We then fine-tune MLLMs using this dataset to enhance their contextual reasoning capabilities. To further enhance performance, we explore test-time scale-up strategies and propose a novel hybrid scaling approach. This approach first generates multiple ImageGen-CoT chains and then produces multiple images for each chain via sampling. Extensive experiments demonstrate the effectiveness of our proposed method. Notably, fine-tuning with the ImageGen-CoT dataset leads to a substantial 80% performance gain for SEED-X on T2I-ICL tasks. See our project page at https://ImageGen-CoT.github.io/. Code and model weights will be open-sourced.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: Project Page: https://ImageGen-CoT.github.io/

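The hybrid test-time scaling loop (several ImageGen-CoT chains, several sampled images per chain, keep the best-scoring candidate) can be written schematically as below. The generator and scorer callables are hypothetical stand-ins; only the loop structure reflects the abstract.

```python
# Illustrative sketch only: the "hybrid" test-time scaling described in the abstract.
# generate_cot, generate_image, and score are hypothetical stand-ins, not real APIs.
import random

def hybrid_scale_up(prompt, generate_cot, generate_image, score,
                    n_chains=3, images_per_chain=2):
    candidates = []
    for _ in range(n_chains):
        chain = generate_cot(prompt)                 # one reasoning chain
        for _ in range(images_per_chain):
            img = generate_image(prompt, chain)      # sample an image conditioned on the chain
            candidates.append((score(prompt, img), img))
    return max(candidates, key=lambda c: c[0])[1]    # keep the highest-scoring image

# Toy stand-ins so the sketch runs end to end.
best = hybrid_scale_up(
    "a red cube on a blue sphere",
    generate_cot=lambda p: f"plan for: {p}",
    generate_image=lambda p, c: f"image<{c}|seed={random.random():.3f}>",
    score=lambda p, img: random.random(),
)
print(best)
```
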
5. arXiv:2503.19288 [pdf, ps, other]
   Subjects: Robotics (cs.RO)
   Title: A Novel Underwater Vehicle With Orientation Adjustable Thrusters: Design and Adaptive Tracking Control
   Authors: Yifei Wang, Shihan Kong, Zhanhua Xin, Kaiwei Zhu, Dongyue Li, Junzhi Yu
   Abstract: Autonomous underwater vehicles (AUVs) are essential for marine exploration and research. However, conventional designs often struggle with limited maneuverability in complex, dynamic underwater environments. This paper introduces an innovative orientation-adjustable thruster AUV (OAT-AUV), equipped with a redundant vector thruster configuration that enables full six-degree-of-freedom (6-DOF) motion and composite maneuvers. To overcome challenges associated with uncertain model parameters and environmental disturbances, a novel feedforward adaptive model predictive controller (FF-AMPC) is proposed to ensure robust trajectory tracking, integrating real-time state feedback with adaptive parameter updates. Extensive experiments, including closed-loop tracking and composite motion tests in a laboratory pool, validate the enhanced performance of the OAT-AUV. The results demonstrate that the OAT-AUV's redundant vector thruster configuration enables a 23.8% cost reduction relative to common vehicles, while the FF-AMPC controller achieves a 68.6% trajectory tracking improvement compared to PID controllers. Uniquely, the system executes composite helical/spiral trajectories unattainable by similar vehicles.
   Submitted 24 March, 2025; originally announced March 2025.

6. arXiv:2503.19201 [pdf, other]
   Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI)
   Title: A Shared Low-Rank Adaptation Approach to Personalized RLHF
   Authors: Renpu Liu, Peng Wang, Donghao Li, Cong Shen, Jing Yang
   Abstract: Reinforcement Learning from Human Feedback (RLHF) has emerged as a pivotal technique for aligning artificial intelligence systems with human values, achieving remarkable success in fine-tuning large language models. However, existing RLHF frameworks often assume that human preferences are relatively homogeneous and can be captured by a single, unified reward model. This assumption overlooks the inherent diversity and heterogeneity across individuals, limiting the adaptability of RLHF to personalized scenarios and risking misalignments that can diminish user satisfaction and trust in AI systems. In this paper, we address these challenges by introducing Low-Rank Adaptation (LoRA) into the personalized RLHF framework. We apply LoRA in the aggregated parameter space of all personalized reward functions, thereby enabling efficient learning of personalized reward models from potentially limited local datasets. Our approach exploits potential shared structures among the local ground-truth reward models while allowing for individual adaptation, without relying on restrictive assumptions about shared representations as in prior works. We further establish sample complexity guarantees for our method. Theoretical analysis demonstrates the effectiveness of the proposed approach in capturing both shared and individual-specific structures within heterogeneous human preferences, addressing the dual challenge of personalization requirements and practical data constraints. Experimental results on real-world datasets corroborate the efficiency of our algorithm in the personalized RLHF setting.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: Published as a conference paper at AISTATS 2025

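One way to picture a shared low-rank structure over personalized reward models is a shared weight vector plus a per-user low-rank correction, as in the sketch below. This is an interpretation of the abstract under assumed linear reward models and dimensions, not the paper's algorithm.

```python
# Illustrative sketch only: a linear reward model whose per-user parameters are a
# shared base plus a low-rank (LoRA-style) correction; one reading of the abstract.
import numpy as np

rng = np.random.default_rng(0)
d_feat, rank, n_users = 128, 4, 10

w_shared = rng.standard_normal(d_feat) * 0.1    # reward weights shared by everyone
A = rng.standard_normal((d_feat, rank)) * 0.05  # shared low-rank factor
B = rng.standard_normal((rank, n_users)) * 0.05 # per-user coefficients (one column each)

def personalized_reward(features, user_id):
    """Score a response embedding for a particular user."""
    w_user = w_shared + A @ B[:, user_id]       # base + low-rank personalization
    return float(features @ w_user)

response_embedding = rng.standard_normal(d_feat)
print([round(personalized_reward(response_embedding, u), 3) for u in range(3)])
```
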
7. arXiv:2503.18888 [pdf, other]
   Subjects: Software Engineering (cs.SE); Computation and Language (cs.CL); Information Retrieval (cs.IR)
   Title: Toward building next-generation Geocoding systems: a systematic review
   Authors: Zhengcong Yin, Daniel W. Goldberg, Binbin Lin, Bing Zhou, Diya Li, Andong Ma, Ziqian Ming, Heng Cai, Zhe Zhang, Shaohua Wang, Shanzhen Gao, Joey Ying Lee, Xiao Li, Da Huo
   Abstract: Geocoding systems are widely used in both scientific research for spatial analysis and everyday life through location-based services. The quality of geocoded data significantly impacts subsequent processes and applications, underscoring the need for next-generation systems. In response to this demand, this review first examines the evolving requirements for geocoding inputs and outputs across the various scenarios these systems must address. It then provides a detailed analysis of how to construct such systems by breaking them down into key functional components and reviewing a broad spectrum of existing approaches, from traditional rule-based methods to advanced techniques in information retrieval, natural language processing, and large language models. Finally, we identify opportunities to improve next-generation geocoding systems in light of recent technological advances.
   Submitted 24 March, 2025; originally announced March 2025.

8. arXiv:2503.18865 [pdf, other]
   Subjects: Artificial Intelligence (cs.AI)
   Title: Structuring Scientific Innovation: A Framework for Modeling and Discovering Impactful Knowledge Combinations
   Authors: Junlan Chen, Kexin Zhang, Daifeng Li, Yangyang Feng, Yuxuan Zhang, Bowen Deng
   Abstract: The emergence of large language models offers new possibilities for structured exploration of scientific knowledge. Rather than viewing scientific discovery as isolated ideas or content, we propose a structured approach that emphasizes the role of method combinations in shaping disruptive insights. Specifically, we investigate how knowledge units, especially those tied to methodological design, can be modeled and recombined to yield research breakthroughs. Our proposed framework addresses two key challenges. First, we introduce a contrastive learning-based mechanism to identify distinguishing features of historically disruptive method combinations within problem-driven contexts. Second, we propose a reasoning-guided Monte Carlo search algorithm that leverages the chain-of-thought capability of LLMs to identify promising knowledge recombinations for new problem statements. Empirical studies across multiple domains show that the framework is capable of modeling the structural dynamics of innovation and successfully highlights combinations with high disruptive potential. This research provides a new path for computationally guided scientific ideation grounded in structured reasoning and historical data modeling.
   Submitted 25 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.

9. arXiv:2503.18680 [pdf, other]
   Subjects: Information Retrieval (cs.IR); Computation and Language (cs.CL)
   Title: ArchSeek: Retrieving Architectural Case Studies Using Vision-Language Models
   Authors: Danrui Li, Yichao Shi, Yaluo Wang, Ziying Shi, Mubbasir Kapadia
   Abstract: Efficiently searching for relevant case studies is critical in architectural design, as designers rely on precedent examples to guide or inspire their ongoing projects. However, traditional text-based search tools struggle to capture the inherently visual and complex nature of architectural knowledge, often leading to time-consuming and imprecise exploration. This paper introduces ArchSeek, an innovative case study search system with recommendation capability, tailored for architecture design professionals. Powered by the visual understanding capabilities of vision-language models and cross-modal embeddings, it enables text and image queries with fine-grained control, and interaction-based design case recommendations. It offers architects a more efficient, personalized way to discover design inspirations, with potential applications across other visually driven design fields. The source code is available at https://github.com/danruili/ArchSeek.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: 15 pages, 8 figures, 3 tables. Accepted by CAAD Futures 2025

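The retrieval backbone alluded to here, ranking case studies by similarity between cross-modal embeddings, follows a standard pattern. The sketch below shows that generic pattern with random placeholder embeddings; it is not taken from the ArchSeek repository.

```python
# Illustrative sketch only: rank case studies by cosine similarity between a query
# embedding and precomputed case-study embeddings (generic cross-modal retrieval).
import numpy as np

rng = np.random.default_rng(0)
case_names = ["villa", "museum", "pavilion", "library"]
case_embeddings = rng.standard_normal((len(case_names), 512))  # e.g., from a VLM encoder

def top_k(query_embedding, k=2):
    q = query_embedding / np.linalg.norm(query_embedding)
    c = case_embeddings / np.linalg.norm(case_embeddings, axis=1, keepdims=True)
    scores = c @ q                                  # cosine similarities
    order = np.argsort(-scores)[:k]
    return [(case_names[i], float(scores[i])) for i in order]

query = rng.standard_normal(512)                    # stand-in for an encoded text/image query
print(top_k(query))
```
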
10. arXiv:2503.18672 [pdf, other]
    Subjects: Computer Vision and Pattern Recognition (cs.CV)
    Title: Feature Calibration enhanced Parameter Synthesis for CLIP-based Class-incremental Learning
    Authors: Juncen Guo, Xiaoguang Zhu, Lianlong Sun, Liangyu Teng, Di Li, Yang Liu, Liang Song
    Abstract: Class-incremental Learning (CIL) enables models to continuously learn new class knowledge while memorizing previous classes, facilitating their adaptation and evolution in dynamic environments. Traditional CIL methods are mainly based on visual features, which limits their ability to handle complex scenarios. In contrast, Vision-Language Models (VLMs) show promising potential to promote CIL by integrating pretrained knowledge with textual features. However, previous methods struggle to overcome catastrophic forgetting while preserving the generalization capabilities of VLMs. To tackle these challenges, we propose Feature Calibration enhanced Parameter Synthesis (FCPS) in this paper. Specifically, our FCPS employs a specific parameter adjustment mechanism to iteratively refine the proportion of original visual features participating in the final class determination, ensuring the model's foundational generalization capabilities. Meanwhile, parameter integration across different tasks achieves a balance between learning new class knowledge and retaining old knowledge. Experimental results on popular benchmarks (e.g., CIFAR100 and ImageNet100) validate the superiority of the proposed method.
    Submitted 25 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.

11. arXiv:2503.18559 [pdf, other]
    Subjects: Computer Vision and Pattern Recognition (cs.CV)
    Title: AMD-Hummingbird: Towards an Efficient Text-to-Video Model
    Authors: Takashi Isobe, He Cui, Dong Zhou, Mengmeng Ge, Dong Li, Emad Barsoum
    Abstract: Text-to-Video (T2V) generation has attracted significant attention for its ability to synthesize realistic videos from textual descriptions. However, existing models struggle to balance computational efficiency and high visual quality, particularly on resource-limited devices, e.g., iGPUs and mobile phones. Most prior work prioritizes visual fidelity while overlooking the need for smaller, more efficient models suitable for real-world deployment. To address this challenge, we propose a lightweight T2V framework, termed Hummingbird, which prunes existing models and enhances visual quality through visual feedback learning. Our approach reduces the size of the U-Net from 1.4 billion to 0.7 billion parameters, significantly improving efficiency while preserving high-quality video generation. Additionally, we introduce a novel data processing pipeline that leverages Large Language Models (LLMs) and Video Quality Assessment (VQA) models to enhance the quality of both text prompts and video data. To support user-driven training and style customization, we publicly release the full training code, including data processing and model training. Extensive experiments show that our method achieves a 31X speedup compared to state-of-the-art models such as VideoCrafter2, while also attaining the highest overall score on VBench. Moreover, our method supports the generation of videos with up to 26 frames, addressing the limitations of existing U-Net-based methods in long video generation. Notably, the entire training process requires only four GPUs, yet delivers performance competitive with existing leading methods. Hummingbird presents a practical and efficient solution for T2V generation, combining high performance, scalability, and flexibility for real-world applications.
    Submitted 24 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.
    Comments: Homepage: https://www.amd.com/en/developer/resources/technical-articles/amd-hummingbird-0-9b-text-to-video-diffusion-model-with-4-step-inferencing.html | GitHub: https://github.com/AMD-AIG-AIMA/AMD-Hummingbird-T2V

12. arXiv:2503.17101 [pdf, other]
    Subjects: Machine Learning (cs.LG)
    Title: Large Language Model Compression via the Nested Activation-Aware Decomposition
    Authors: Jun Lu, Tianyi Xu, Bill Ding, David Li, Yu Kang
    Abstract: In this paper, we tackle the critical challenge of compressing large language models (LLMs) to facilitate their practical deployment and broader adoption. We introduce a novel post-training compression paradigm that focuses on low-rank decomposition of LLM weights. Our analysis identifies two main challenges in this task: the variability in LLM activation distributions and handling unseen activations from different datasets and models. To address these challenges, we propose a nested activation-aware framework (NSVD) for LLMs, a training-free approach designed to enhance the accuracy of low-rank decompositions by managing activation outliers through transforming the weight matrix based on the activation distribution and the original weight matrix. This method allows for the absorption of outliers into the transformed weight matrix, improving decomposition accuracy. Our comprehensive evaluation across eight datasets and six models from three distinct LLM families demonstrates the superiority of NSVD over current state-of-the-art methods, especially at medium to large compression ratios or in multilingual and multitask settings.
    Submitted 21 March, 2025; originally announced March 2025.

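For context, a generic activation-aware low-rank decomposition (scale the weight matrix by a per-channel activation statistic, truncate the SVD, fold the scaling back out) looks like the sketch below. This illustrates the family of methods the paper builds on; the paper's nested NSVD construction itself is not reproduced here, and the calibration data and rank are arbitrary.

```python
# Illustrative sketch only: plain vs. activation-aware truncated SVD of a weight
# matrix, compared by relative output error on synthetic calibration activations.
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((256, 512))                                 # weight: (d_out, d_in)
X = rng.standard_normal((1024, 512)) * np.linspace(0.1, 3.0, 512)   # calibration activations
rank = 64

def truncated(M, k):
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    return (U[:, :k] * s[:k]) @ Vt[:k]

# Plain low-rank approximation of W.
W_plain = truncated(W, rank)

# Activation-aware: absorb a per-input-channel activation scale into W before truncating.
scale = np.sqrt((X ** 2).mean(axis=0)) + 1e-6
W_aware = truncated(W * scale, rank) / scale                         # fold the scaling back out

def output_error(W_hat):
    return np.linalg.norm(X @ W.T - X @ W_hat.T) / np.linalg.norm(X @ W.T)

print(f"plain SVD error:        {output_error(W_plain):.3f}")
print(f"activation-aware error: {output_error(W_aware):.3f}")
```
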
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17101v1-abstract-full').style.display = 'none'; document.getElementById('2503.17101v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16828">arXiv:2503.16828</a> <span> [<a href="https://arxiv.org/pdf/2503.16828">pdf</a>, <a href="https://arxiv.org/format/2503.16828">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Expressive Public Key Authenticated Encryption with Keyword Search in Multi-user Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jiayin Cai</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xingwen Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dexin Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hui Li</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+K">Kai Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16828v1-abstract-short" style="display: inline;"> Public key authenticated encryption with keyword search (PAEKS) represents a significant advancement of secure and searchable data sharing in public network systems, such as medical systems. It can effectively mitigate the risk of keyword guessing attacks (KGA), which is a critical issue in public key encryption with keyword search (PEKS). However, in scenarios with a large number of users, the en… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16828v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16828v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16828v1-abstract-full" style="display: none;"> Public key authenticated encryption with keyword search (PAEKS) represents a significant advancement of secure and searchable data sharing in public network systems, such as medical systems. It can effectively mitigate the risk of keyword guessing attacks (KGA), which is a critical issue in public key encryption with keyword search (PEKS). However, in scenarios with a large number of users, the enforced point-to-point access control necessitates that the data sender encrypt the same keyword using the public keys of multiple receivers to create indexes, while the data receiver also must generate trapdoors of size linear to senders in the system. The burden on users aiming for efficient data sharing is considerable, as the overheads increase linearly with the number of users. Furthermore, the majority of current PAEKS schemes lack expressive search functions, including conjunctions, disjunctions, or any monotone boolean formulas, which are prevalent in practical applications. To tackle the abovementioned challenges, we propose an efficient and expressive PAEKS scheme. 
For efficiency, an auxiliary server is integrated to assist users in generating indexes and trapdoors. Users encrypt with their respective private keys along with the public keys of the servers, facilitating secure and searchable data sharing while significantly minimizing overhead. Additionally, a linear secret sharing scheme (LSSS) is employed to implement expressive search, including monotone boolean queries. We also obfuscate the mapping between the LSSS matrix and the keywords, thereby enhancing privacy protection. Security analysis, alongside theoretical and experimental evaluations of our scheme, illustrates its practicality and efficiency in multi-user data sharing scenarios.
Submitted 20 March, 2025; originally announced March 2025.

arXiv:2503.16402 (https://arxiv.org/abs/2503.16402) [pdf, other] cs.AI cs.CL cs.LG
The Emperor's New Clothes in Benchmarking? A Rigorous Examination of Mitigation Strategies for LLM Benchmark Data Contamination
Authors: Yifan Sun, Han Wang, Dongbai Li, Gang Wang, Huan Zhang
Abstract: Benchmark Data Contamination (BDC), the inclusion of benchmark testing samples in the training set, has raised increasing concerns in Large Language Model (LLM) evaluation, leading to falsely inflated performance estimates and undermining evaluation reliability.
To address this, researchers have proposed various mitigation strategies to update existing benchmarks, including modifying original questions or generating new ones based on them. However, a rigorous examination of the effectiveness of these mitigation strategies remains lacking. In this paper, we design a systematic and controlled pipeline along with two novel metrics, fidelity and contamination resistance, to provide a fine-grained and comprehensive assessment of existing BDC mitigation strategies. Previous assessment methods, such as accuracy drop and accuracy matching, focus solely on aggregate accuracy, often leading to incomplete or misleading conclusions. Our metrics address this limitation by emphasizing question-level evaluation result matching. Extensive experiments with 10 LLMs, 5 benchmarks, 20 BDC mitigation strategies, and 2 contamination scenarios reveal that no existing strategy significantly improves resistance over the vanilla case (i.e., no benchmark update) across all benchmarks, and none effectively balances fidelity and contamination resistance. These findings underscore the urgent need for designing more effective BDC mitigation strategies. Our code repository is available at https://github.com/ASTRAL-Group/BDC_mitigation_assessment.
Submitted 20 March, 2025; originally announced March 2025.
Comments: 23 pages
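
The fidelity and contamination-resistance metrics above are built around question-level evaluation result matching rather than aggregate accuracy. The sketch below shows one plausible way to compute such question-level agreement between per-question correctness on an original benchmark and its updated version; the exact metric definitions in the paper may differ, and the toy data is invented.

```python
# Illustrative question-level agreement between per-question correctness
# vectors on an original benchmark and its updated (mitigated) version.
# This is a plausible stand-in for the paper's fidelity/resistance metrics,
# not their exact definitions; the toy data below is invented.
from typing import List

def question_level_match(orig_correct: List[bool], updated_correct: List[bool]) -> float:
    """Fraction of questions on which the evaluation outcome is identical."""
    assert len(orig_correct) == len(updated_correct)
    agree = sum(o == u for o, u in zip(orig_correct, updated_correct))
    return agree / len(orig_correct)

# Uncontaminated model: the updated benchmark should preserve its outcomes (fidelity-like).
clean_orig    = [True, True, False, True, False, True]
clean_updated = [True, True, False, True, True, True]
# Contaminated model: memorized answers should stop helping (resistance-like).
contam_orig    = [True, True, True, True, True, True]
contam_updated = [True, False, True, False, True, False]

print("fidelity-like agreement:  ", question_level_match(clean_orig, clean_updated))
print("resistance-like agreement:", question_level_match(contam_orig, contam_updated))
```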
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15475">arXiv:2503.15475</a> <span> [<a href="https://arxiv.org/pdf/2503.15475">pdf</a>, <a href="https://arxiv.org/format/2503.15475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cube: A Roblox View of 3D Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Foundation+AI+Team"> Foundation AI Team</a>, <a href="/search/cs?searchtype=author&query=Bhat%2C+K">Kiran Bhat</a>, <a href="/search/cs?searchtype=author&query=Khanna%2C+N">Nishchaie Khanna</a>, <a href="/search/cs?searchtype=author&query=Channa%2C+K">Karun Channa</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tinghui Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xiaoxia Sun</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+C">Charles Shang</a>, <a href="/search/cs?searchtype=author&query=Sudarshan%2C+A">Anirudh Sudarshan</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+M">Maurice Chu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Daiqing Li</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+K">Kangle Deng</a>, <a href="/search/cs?searchtype=author&query=Fauconnier%2C+J">Jean-Philippe Fauconnier</a>, <a href="/search/cs?searchtype=author&query=Verhulsdonck%2C+T">Tijmen Verhulsdonck</a>, <a href="/search/cs?searchtype=author&query=Agrawala%2C+M">Maneesh Agrawala</a>, <a href="/search/cs?searchtype=author&query=Fatahalian%2C+K">Kayvon Fatahalian</a>, <a href="/search/cs?searchtype=author&query=Weiss%2C+A">Alexander Weiss</a>, <a href="/search/cs?searchtype=author&query=Reiser%2C+C">Christian Reiser</a>, <a href="/search/cs?searchtype=author&query=Chirravuri%2C+R+K">Ravi Kiran Chirravuri</a>, <a href="/search/cs?searchtype=author&query=Kandur%2C+R">Ravali Kandur</a>, <a href="/search/cs?searchtype=author&query=Pelaez%2C+A">Alejandro Pelaez</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+A">Akash Garg</a>, <a href="/search/cs?searchtype=author&query=Palleschi%2C+M">Michael Palleschi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jessica Wang</a>, <a href="/search/cs?searchtype=author&query=Litz%2C+S">Skylar Litz</a> , et al. (20 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15475v1-abstract-short" style="display: inline;"> Foundation models trained on vast amounts of data have demonstrated remarkable reasoning and generation capabilities in the domains of text, images, audio and video. 
Our goal at Roblox is to build such a foundation model for 3D intelligence, a model that can support developers in producing all aspects of a Roblox experience, from generating 3D objects and scenes to rigging characters for animation to producing programmatic scripts describing object behaviors. We discuss three key design requirements for such a 3D foundation model and then present our first step towards building such a model. We expect that 3D geometric shapes will be a core data type and describe our solution for a 3D shape tokenizer. We show how our tokenization scheme can be used in applications for text-to-shape generation, shape-to-text generation and text-to-scene generation. We demonstrate how these applications can collaborate with existing large language models (LLMs) to perform scene analysis and reasoning. We conclude with a discussion outlining our path to building a fully unified foundation model for 3D intelligence.
Submitted 19 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our code and model weights can be found at: https://github.com/Roblox/cube</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15234">arXiv:2503.15234</a> <span> [<a href="https://arxiv.org/pdf/2503.15234">pdf</a>, <a href="https://arxiv.org/format/2503.15234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoE: Chain-of-Explanation via Automatic Visual Concept Circuit Description and Polysemanticity Quantification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenlong Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qilong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chuang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15234v1-abstract-short" style="display: inline;"> Explainability is a critical factor influencing the wide deployment of deep vision models (DVMs). Concept-based post-hoc explanation methods can provide both global and local insights into model decisions. However, current methods in this field face challenges in that they are inflexible to automatically construct accurate and sufficient linguistic explanations for global concepts and local circui… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15234v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15234v1-abstract-full" style="display: none;"> Explainability is a critical factor influencing the wide deployment of deep vision models (DVMs). Concept-based post-hoc explanation methods can provide both global and local insights into model decisions. However, current methods in this field face challenges in that they are inflexible to automatically construct accurate and sufficient linguistic explanations for global concepts and local circuits. Particularly, the intrinsic polysemanticity in semantic Visual Concepts (VCs) impedes the interpretability of concepts and DVMs, which is underestimated severely. In this paper, we propose a Chain-of-Explanation (CoE) approach to address these issues. Specifically, CoE automates the decoding and description of VCs to construct global concept explanation datasets. Further, to alleviate the effect of polysemanticity on model explainability, we design a concept polysemanticity disentanglement and filtering mechanism to distinguish the most contextually relevant concept atoms. Besides, a Concept Polysemanticity Entropy (CPE), as a measure of model interpretability, is formulated to quantify the degree of concept uncertainty. 
The modeling of deterministic concepts is upgraded to uncertain concept atom distributions. Finally, CoE automatically enables linguistic local explanations of the decision-making process of DVMs by tracing the concept circuit. GPT-4o and human-based experiments demonstrate the effectiveness of CPE and the superiority of CoE, achieving an average absolute improvement of 36% in terms of explainability scores.
Submitted 19 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025
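
CPE quantifies concept uncertainty over concept atom distributions. One natural reading, sketched below, is the Shannon entropy of a normalized concept-atom weight vector; the weights and the normalization are assumptions rather than the paper's exact formulation.

```python
# A natural reading of a "polysemanticity entropy": the Shannon entropy of the
# distribution of a visual concept over its candidate concept atoms. Higher
# entropy = the concept spreads over many atoms (more polysemantic). The
# weights below are invented and the normalization is an assumption.
import numpy as np

def concept_polysemanticity_entropy(atom_weights) -> float:
    w = np.asarray(atom_weights, dtype=float)
    p = w / w.sum()                      # normalize to a distribution
    p = p[p > 0]                         # ignore zero-mass atoms
    return float(-(p * np.log(p)).sum())

monosemantic = [0.95, 0.03, 0.02]        # dominated by one atom -> low entropy
polysemantic = [0.30, 0.25, 0.25, 0.20]  # spread over atoms -> high entropy
print(concept_polysemanticity_entropy(monosemantic))
print(concept_polysemanticity_entropy(polysemantic))
```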

arXiv:2503.13946 (https://arxiv.org/abs/2503.13946) [pdf, other] cs.CV
Is Discretization Fusion All You Need for Collaborative Perception?
Authors: Kang Yang, Tianci Bu, Lantao Li, Chunxu Li, Yongcai Wang, Deying Li
Abstract: Collaborative perception in multi-agent systems enhances overall perceptual capabilities by facilitating the exchange of complementary information among agents. Current mainstream collaborative perception methods rely on discretized feature maps to conduct fusion, which, however, lacks flexibility in extracting and transmitting informative features and can hardly focus on the informative features during fusion. To address these problems, this paper proposes a novel Anchor-Centric paradigm for Collaborative Object detection (ACCO). It avoids grid precision issues and allows more flexible and efficient anchor-centric communication and fusion. ACCO is composed of three main components: (1) an Anchor Featuring Block (AFB) that generates anchor proposals and projects prepared anchor queries to image features; (2) an Anchor Confidence Generator (ACG) designed to minimize communication by selecting only the features in confident anchors for transmission; and (3) a local-global fusion module, in which local fusion is anchor alignment-based fusion (LAAF) and global fusion is conducted by spatial-aware cross-attention (SACA). LAAF and SACA run over multiple layers, so agents conduct anchor-centric fusion iteratively to adjust the anchor proposals. Comprehensive experiments evaluating ACCO on the OPV2V and Dair-V2X datasets demonstrate ACCO's superiority in reducing communication volume and in improving perception range and detection performance. Code can be found at: https://github.com/sidiangongyuan/ACCO.
Submitted 18 March, 2025; originally announced March 2025.
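
The Anchor Confidence Generator reduces communication by transmitting only the features attached to confident anchors. The sketch below isolates that selection step: score anchors, keep the top fraction, and send only their feature vectors. The tensor shapes, keep ratio, and float32 payload accounting are assumptions.

```python
# Isolated sketch of confidence-gated anchor transmission: keep only the
# feature vectors of the most confident anchors to cut the communicated
# payload. Shapes, the keep ratio, and the scoring are assumptions.
import torch

def select_confident_anchors(anchor_feats: torch.Tensor,
                             confidences: torch.Tensor,
                             keep_ratio: float = 0.2):
    """anchor_feats: (N, C) per-anchor features; confidences: (N,) in [0, 1]."""
    n_keep = max(1, int(anchor_feats.shape[0] * keep_ratio))
    keep_idx = torch.topk(confidences, n_keep).indices
    return keep_idx, anchor_feats[keep_idx]

torch.manual_seed(0)
feats = torch.randn(900, 256)                     # e.g. 900 anchor queries
conf = torch.rand(900)                            # confidence per anchor
idx, payload = select_confident_anchors(feats, conf, keep_ratio=0.2)
full_bytes, sent_bytes = feats.numel() * 4, payload.numel() * 4
print(f"transmit {payload.shape[0]} of {feats.shape[0]} anchors, "
      f"{sent_bytes / full_bytes:.0%} of the dense payload")
```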

arXiv:2503.13358 (https://arxiv.org/abs/2503.13358) [pdf, other] cs.CV
One-Step Residual Shifting Diffusion for Image Super-Resolution via Distillation
Authors: Daniil Selikhanovych, David Li, Aleksei Leonov, Nikita Gushchin, Sergei Kushneriuk, Alexander Filippov, Evgeny Burnaev, Iaroslav Koshelev, Alexander Korotin
Abstract: Diffusion models for super-resolution (SR) produce high-quality visual results but incur expensive computational costs. Despite the development of several methods to accelerate diffusion-based SR models, some (e.g., SinSR) fail to produce realistic perceptual details, while others (e.g., OSEDiff) may hallucinate non-existent structures. To overcome these issues, we present RSD, a new distillation method for ResShift, one of the top diffusion-based SR models. Our method is based on training the student network to produce images such that a new fake ResShift model trained on them coincides with the teacher model. RSD achieves single-step restoration and outperforms the teacher by a large margin. We show that our distillation method can surpass the other distillation-based method for ResShift, SinSR, making RSD on par with state-of-the-art diffusion-based SR distillation methods. Compared to SR methods based on pre-trained text-to-image models, RSD produces competitive perceptual quality, provides images with better alignment to degraded input images, and requires fewer parameters and GPU memory. We provide experimental results on various real-world and synthetic datasets, including RealSR, RealSet65, DRealSR, ImageNet, and DIV2K.
Submitted 17 March, 2025; originally announced March 2025.

arXiv:2503.12871 (https://arxiv.org/abs/2503.12871) [pdf] cs.NI
A Reference Architecture for Autonomous Networks: An Agent-Based Approach
Authors: Joseph Sifakis, Dongming Li, Hairong Huang, Yong Zhang, Wenshuan Dang, River Huang, Yijun Yu
Abstract: The vision of autonomous systems is becoming increasingly important in many application areas, where the aim is to replace humans with agents. These include autonomous vehicles as well as agent applications in business processes and problem-solving. For networks, the increasing scale and operation and management (O&M) complexity drive the need for autonomous networks (AN). The technical objective of AN is to ensure trustworthy O&M without human intervention for higher efficiency and lower operating costs. However, realizing AN seems more difficult than autonomous vehicles: it encounters the challenges of networks' structural and functional complexity, as networks operate as distributed dynamic systems governed by various technical and economic constraints. A key problem lies in formulating a rigorous development methodology that facilitates a seamless transition from traditional networks to AN. Central to this methodology is the definition of a reference architecture for network agents, which specifies the required functionalities for their realization, regardless of implementation choices. This article proposes a reference architecture characterizing the main functional features, illustrating its application with network use cases. It shows how artificial intelligence components can be used to implement the required functionality and its coordination.
The latter is achieved through the management and generation of shared domain-specific knowledge stored in long-term memory, ensuring the overall consistency of decisions and their execution. The article concludes with a discussion of architecture specialization for building network layer agents. It also identifies the main technical challenges ahead, such as satisfying essential requirements at development or runtime, as well as the issue of coordinating agents to achieve collective intelligence in meeting overall network goals.
Submitted 18 March, 2025; v1 submitted 17 March, 2025; originally announced March 2025.

arXiv:2503.12764 (https://arxiv.org/abs/2503.12764) [pdf, other] cs.CV
Decouple to Reconstruct: High Quality UHD Restoration via Active Feature Disentanglement and Reversible Fusion
Authors: Yidi Liu, Dong Li, Yuxin Ma, Jie Huang, Wenlong Zhang, Xueyang Fu, Zheng-jun Zha
Abstract: Ultra-high-definition (UHD) image restoration often faces computational bottlenecks and information loss due to its extremely high resolution. Existing studies based on Variational Autoencoders (VAE) improve efficiency by transferring the image restoration process from pixel space to latent space.
However, degraded components are inherently coupled with background elements in degraded images, so both the information loss during compression and the information gain during compensation remain uncontrollable. As a result, restored images often exhibit loss of detail and incomplete degradation removal. To address this issue, we propose a Controlled Differential Disentangled VAE, which utilizes Hierarchical Contrastive Disentanglement Learning and an Orthogonal Gated Projection Module to guide the VAE to actively discard easily recoverable background information while encoding more difficult-to-recover degraded information into the latent space. Additionally, we design a Complex Invertible Multiscale Fusion Network to handle background features, ensuring their consistency, and utilize a latent space restoration network to transform the degraded latent features, leading to more accurate restoration results. Extensive experimental results demonstrate that our method effectively alleviates the information loss problem in VAE models while ensuring computational efficiency, significantly improving the quality of UHD image restoration, and achieves state-of-the-art results in six UHD restoration tasks with only 1M parameters.
Submitted 16 March, 2025; originally announced March 2025.

arXiv:2503.12150 (https://arxiv.org/abs/2503.12150) [pdf, other] cs.CV
Point-Cache: Test-time Dynamic and Hierarchical Cache for Robust and Generalizable Point Cloud Analysis
Authors: Hongyu Sun, Qiuhong Ke, Ming Cheng, Yongcai Wang, Deying Li, Chenhui Gou, Jianfei Cai
Abstract: This paper proposes a general solution to enable point cloud recognition models to handle distribution shifts at test time.
Unlike prior methods, which rely heavily on training data (often inaccessible during online inference) and are limited to recognizing a fixed set of point cloud classes predefined during training, we explore a more practical and challenging scenario: adapting the model solely based on online test data to recognize both previously seen classes and novel, unseen classes at test time. To this end, we develop Point-Cache, a hierarchical cache model that captures essential clues of online test samples, particularly focusing on the global structure of point clouds and their local-part details. Point-Cache, which serves as a rich 3D knowledge base, is dynamically managed to prioritize the inclusion of high-quality samples. Designed as a plug-and-play module, our method can be flexibly integrated into large multimodal 3D models to support open-vocabulary point cloud recognition. Notably, our solution operates with efficiency comparable to zero-shot inference, as it is entirely training-free. Point-Cache demonstrates substantial gains across 8 challenging benchmarks and 4 representative large 3D models, highlighting its effectiveness. Code is available at https://github.com/auniquesun/Point-Cache.
Submitted 26 March, 2025; v1 submitted 15 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025; 24 pages, 14 figures, 18 tables
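
Point-Cache is described as a dynamically managed cache that prioritizes high-quality test samples and serves as a knowledge base at inference. The sketch below shows a generic, single-level per-class feature cache that keeps the most confident embeddings and classifies queries by cosine similarity to cached entries; the capacity, the confidence criterion, and the flat (non-hierarchical) structure are simplifying assumptions.

```python
# Generic per-class feature cache for test-time adaptation: keep the k most
# confident embeddings seen so far for each (pseudo-)class and classify new
# samples by cosine similarity to cached entries. A simplified, single-level
# stand-in for the hierarchical cache described in the abstract.
import numpy as np
from collections import defaultdict

class FeatureCache:
    def __init__(self, per_class_capacity: int = 8):
        self.capacity = per_class_capacity
        self.store = defaultdict(list)   # class -> list of (confidence, embedding)

    def add(self, label: int, confidence: float, embedding: np.ndarray):
        entries = self.store[label]
        entries.append((confidence, embedding / np.linalg.norm(embedding)))
        # Keep only the most confident entries per class.
        entries.sort(key=lambda e: e[0], reverse=True)
        del entries[self.capacity:]

    def classify(self, embedding: np.ndarray) -> int:
        q = embedding / np.linalg.norm(embedding)
        best_label, best_sim = -1, -np.inf
        for label, entries in self.store.items():
            sim = max(float(q @ e) for _, e in entries)
            if sim > best_sim:
                best_label, best_sim = label, sim
        return best_label

rng = np.random.default_rng(0)
cache = FeatureCache(per_class_capacity=4)
for label in (0, 1):                       # fill with toy class prototypes plus noise
    proto = rng.standard_normal(64) + 3 * label
    for _ in range(6):
        cache.add(label, rng.random(), proto + 0.1 * rng.standard_normal(64))
query = rng.standard_normal(64) + 3        # shares class 1's offset, so likely classified as 1
print(cache.classify(query))
```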
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025; 24 pages, 14 figures, 18 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11730">arXiv:2503.11730</a> <span> [<a href="https://arxiv.org/pdf/2503.11730">pdf</a>, <a href="https://arxiv.org/format/2503.11730">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BACE-RUL: A Bi-directional Adversarial Network with Covariate Encoding for Machine Remaining Useful Life Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zekai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dan Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shunyu Wu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Junya Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+S+K">See Kiong Ng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zibin Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11730v1-abstract-short" style="display: inline;"> Prognostic and Health Management (PHM) are crucial ways to avoid unnecessary maintenance for Cyber-Physical Systems (CPS) and improve system reliability. Predicting the Remaining Useful Life (RUL) is one of the most challenging tasks for PHM. Existing methods require prior knowledge about the system, contrived assumptions, or temporal mining to model the life cycles of machine equipment/devices, r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11730v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11730v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11730v1-abstract-full" style="display: none;"> Prognostic and Health Management (PHM) are crucial ways to avoid unnecessary maintenance for Cyber-Physical Systems (CPS) and improve system reliability. Predicting the Remaining Useful Life (RUL) is one of the most challenging tasks for PHM. Existing methods require prior knowledge about the system, contrived assumptions, or temporal mining to model the life cycles of machine equipment/devices, resulting in diminished accuracy and limited applicability in real-world scenarios. This paper proposes a Bi-directional Adversarial network with Covariate Encoding for machine Remaining Useful Life (BACE-RUL) prediction, which only adopts sensor measurements from the current life cycle to predict RUL rather than relying on previous consecutive cycle recordings. The current sensor measurements of mechanical devices are encoded to a conditional space to better understand the implicit inner mechanical status. The predictor is trained as a conditional generative network with the encoded sensor measurements as its conditions. 
Various experiments on several real-world datasets, including the turbofan aircraft engine dataset and a dataset collected from degradation experiments on Li-Ion battery cells, show that the proposed model is a general framework and outperforms state-of-the-art methods.
Submitted 14 March, 2025; originally announced March 2025.
Comments: Accepted as a research paper at CollaborateCom 2024

arXiv:2503.11345 (https://arxiv.org/abs/2503.11345) [pdf, other] cs.CV
EgoSplat: Open-Vocabulary Egocentric Scene Understanding with Language Embedded 3D Gaussian Splatting
Authors: Di Li, Jie Feng, Jiahao Chen, Weisheng Dong, Guanbin Li, Guangming Shi, Licheng Jiao
Abstract: Egocentric scenes exhibit frequent occlusions, varied viewpoints, and dynamic interactions compared to typical scene understanding tasks. Occlusions and varied viewpoints can lead to multi-view semantic inconsistencies, while dynamic objects may act as transient distractors, introducing artifacts into semantic feature modeling.
To address these challenges, we propose EgoSplat, a language-embedded 3D Gaussian Splatting framework for open-vocabulary egocentric scene understanding. A multi-view consistent instance feature aggregation method is designed to leverage the segmentation and tracking capabilities of SAM2 to selectively aggregate complementary features across views for each instance, ensuring precise semantic representation of scenes. Additionally, an instance-aware spatial-temporal transient prediction module is constructed to improve spatial integrity and temporal continuity in predictions by incorporating spatial-temporal associations across multi-view instances, effectively reducing artifacts in the semantic reconstruction of egocentric scenes. EgoSplat achieves state-of-the-art performance in both localization and segmentation tasks on two datasets, outperforming existing methods with an 8.2% improvement in localization accuracy and a 3.7% improvement in segmentation mIoU on the ADT dataset, and setting a new benchmark in open-vocabulary egocentric scene understanding. The code will be made publicly available.
Submitted 14 March, 2025; originally announced March 2025.

arXiv:2503.11133 (https://arxiv.org/abs/2503.11133) [pdf, other] cs.CV eess.IV
SpaceSeg: A High-Precision Intelligent Perception Segmentation Method for Multi-Spacecraft On-Orbit Targets
Authors: Hao Liu, Pengyu Guo, Siyuan Yang, Zeqing Jiang, Qinglei Hu, Dongyu Li
Abstract: With the continuous advancement of human exploration into deep space, intelligent perception and high-precision segmentation technology for on-orbit multi-spacecraft targets have become critical factors for ensuring the success of modern space missions.
However, the complex deep space environment, diverse imaging conditions, and high variability in spacecraft morphology pose significant challenges to traditional segmentation methods. This paper proposes SpaceSeg, an innovative vision foundation model-based segmentation framework with four core technical innovations. First, the Multi-Scale Hierarchical Attention Refinement Decoder (MSHARD) achieves high-precision feature decoding through cross-resolution feature fusion via hierarchical attention. Second, the Multi-spacecraft Connected Component Analysis (MS-CCA) effectively resolves topological structure confusion in dense targets. Third, the Spatial Domain Adaptation Transform framework (SDAT) eliminates cross-domain disparities and resists spatial sensor perturbations through composite enhancement strategies. Finally, a custom Multi-Spacecraft Segmentation Task Loss Function is created to significantly improve segmentation robustness in deep space scenarios. To support algorithm validation, we construct SpaceES, the first multi-scale on-orbit multi-spacecraft semantic segmentation dataset, covering four types of spatial backgrounds and 17 typical spacecraft targets. In testing, SpaceSeg achieves state-of-the-art performance with 89.87% mIoU and 99.98% mAcc, surpassing existing best methods by 5.71 percentage points. The dataset and code are open-sourced at https://github.com/Akibaru/SpaceSeg to provide critical technical support for next-generation space situational awareness systems.
Submitted 14 March, 2025; originally announced March 2025.

arXiv:2503.11062 (https://arxiv.org/abs/2503.11062) [pdf, other] cs.CV
Active Learning from Scene Embeddings for End-to-End Autonomous Driving
Authors: Wenhao Jiang, Duo Li, Menghan Hu, Chao Ma, Ke Wang, Zhipeng Zhang
Abstract: In the field of autonomous driving, end-to-end deep learning models show great potential by learning driving decisions directly from sensor data. However, training these models requires large amounts of labeled data, which is time-consuming and expensive. Considering that real-world driving data exhibits a long-tailed distribution in which simple scenarios constitute the majority of the data, we are inspired to identify the most challenging scenarios within it. We can then efficiently improve the performance of the model by training on the selected data of the highest value. Prior research has focused on selecting valuable data through empirically designed strategies; however, manually designed methods generalize poorly to new data distributions. Observing that the BEV (Bird's Eye View) features in end-to-end models contain all the information required to represent the scenario, we propose an active learning framework that relies on these vectorized scene-level features, called SEAD. The framework selects initial data based on driving-environmental information and incremental data based on BEV features. Experiments show that we only need 30% of the nuScenes training data to achieve performance close to what can be achieved with the full dataset. The source code will be released.
Submitted 13 March, 2025; originally announced March 2025.
Comments: 9 pages, 5 figures
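
Selecting incremental data from vectorized scene embeddings is typically done with a diversity criterion. The sketch below uses k-center greedy (farthest-point) selection over toy BEV-style scene vectors as one plausible selection rule; SEAD's actual criterion, the embedding dimensionality, and the labeling budget are not specified here and are assumptions.

```python
# k-center greedy (farthest-point) selection over scene embeddings: repeatedly
# pick the scene farthest from everything selected so far. One plausible
# diversity-based rule for choosing incremental data from BEV-style scene
# vectors; the budget, dimensionality, and data are invented.
import numpy as np

def k_center_greedy(embeddings: np.ndarray, budget: int, seed: int = 0) -> list:
    rng = np.random.default_rng(seed)
    selected = [int(rng.integers(len(embeddings)))]
    # Distance from every point to its nearest selected point.
    dist = np.linalg.norm(embeddings - embeddings[selected[0]], axis=1)
    while len(selected) < budget:
        nxt = int(dist.argmax())                       # farthest point so far
        selected.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(embeddings - embeddings[nxt], axis=1))
    return selected

rng = np.random.default_rng(1)
scene_embeddings = rng.standard_normal((1000, 128))    # toy BEV scene vectors
picked = k_center_greedy(scene_embeddings, budget=50)
print(len(picked), "scenes selected for labeling")
```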
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11062v1-abstract-full').style.display = 'none'; document.getElementById('2503.11062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10697">arXiv:2503.10697</a> <span> [<a href="https://arxiv.org/pdf/2503.10697">pdf</a>, <a href="https://arxiv.org/format/2503.10697">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Subject-Centric Generation for Creative Application Using Entropy Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+K">Kaifeng Zou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xiaoyi Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tao Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zizhou Huang</a>, <a href="/search/cs?searchtype=author&query=Haihang%2C+Z">Zhang Haihang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yuntao Zou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dagang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10697v1-abstract-short" style="display: inline;"> Generative models are widely used in visual content creation. However, current text-to-image models often face challenges in practical applications-such as textile pattern design and meme generation-due to the presence of unwanted elements that are difficult to separate with existing methods. Meanwhile, subject-reference generation has emerged as a key research trend, highlighting the need for tec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10697v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10697v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10697v1-abstract-full" style="display: none;"> Generative models are widely used in visual content creation. However, current text-to-image models often face challenges in practical applications-such as textile pattern design and meme generation-due to the presence of unwanted elements that are difficult to separate with existing methods. 

arXiv:2503.10434 (https://arxiv.org/abs/2503.10434) [cs.RO, cs.CV, cs.LG]
Title: Finetuning Generative Trajectory Model with Reinforcement Learning from Human Feedback
Authors: Derun Li, Jianwei Ren, Yue Wang, Xin Wen, Pengxiang Li, Leimeng Xu, Kun Zhan, Zhongpu Xia, Peng Jia, Xianpeng Lang, Ningyi Xu, Hang Zhao
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10434v1-abstract-short" style="display: inline;"> Generating human-like and adaptive trajectories is essential for autonomous driving in dynamic environments. While generative models have shown promise in synthesizing feasible trajectories, they often fail to capture the nuanced variability of human driving styles due to dataset biases and distributional shifts. To address this, we introduce TrajHF, a human feedback-driven finetuning framework fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10434v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10434v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10434v1-abstract-full" style="display: none;"> Generating human-like and adaptive trajectories is essential for autonomous driving in dynamic environments. While generative models have shown promise in synthesizing feasible trajectories, they often fail to capture the nuanced variability of human driving styles due to dataset biases and distributional shifts. To address this, we introduce TrajHF, a human feedback-driven finetuning framework for generative trajectory models, designed to align motion planning with diverse driving preferences. TrajHF incorporates multi-conditional denoiser and reinforcement learning with human feedback to refine multi-modal trajectory generation beyond conventional imitation learning. This enables better alignment with human driving preferences while maintaining safety and feasibility constraints. TrajHF achieves PDMS of 93.95 on NavSim benchmark, significantly exceeding other methods. TrajHF sets a new paradigm for personalized and adaptable trajectory generation in autonomous driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10434v1-abstract-full').style.display = 'none'; document.getElementById('2503.10434v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10239">arXiv:2503.10239</a> <span> [<a href="https://arxiv.org/pdf/2503.10239">pdf</a>, <a href="https://arxiv.org/format/2503.10239">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> I Can Tell Your Secrets: Inferring Privacy Attributes from Mini-app Interaction History in Super-apps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yifeng Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+M">Mengyu Yao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junlin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiaoke Zhao</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xinyi Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiangqun Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yao Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Ding Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10239v1-abstract-short" style="display: inline;"> Super-apps have emerged as comprehensive platforms integrating various mini-apps to provide diverse services. While super-apps offer convenience and enriched functionality, they can introduce new privacy risks. This paper reveals a new privacy leakage source in super-apps: mini-app interaction history, including mini-app usage history (Mini-H) and operation history (Op-H). Mini-H refers to the his… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10239v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10239v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10239v1-abstract-full" style="display: none;"> Super-apps have emerged as comprehensive platforms integrating various mini-apps to provide diverse services. While super-apps offer convenience and enriched functionality, they can introduce new privacy risks. This paper reveals a new privacy leakage source in super-apps: mini-app interaction history, including mini-app usage history (Mini-H) and operation history (Op-H). Mini-H refers to the history of mini-apps accessed by users, such as their frequency and categories. Op-H captures user interactions within mini-apps, including button clicks, bar drags, and image views. Super-apps can naturally collect these data without instrumentation due to the web-based feature of mini-apps. We identify these data types as novel and unexplored privacy risks through a literature review of 30 papers and an empirical analysis of 31 super-apps. We design a mini-app interaction history-oriented inference attack (THEFT), to exploit this new vulnerability. 

arXiv:2503.10218 (https://arxiv.org/abs/2503.10218) [cs.LG, cs.CR] DOI: 10.1145/3712274
Title: Moss: Proxy Model-based Full-Weight Aggregation in Federated Learning with Heterogeneous Models
Authors: Yifeng Cai, Ziqi Zhang, Ding Li, Yao Guo, Xiangqun Chen
Abstract: Modern Federated Learning (FL) has become increasingly essential for handling highly heterogeneous mobile devices. Current approaches adopt a partial model aggregation paradigm that leads to sub-optimal model accuracy and higher training overhead. In this paper, we challenge the prevailing notion of partial-model aggregation and propose a novel "full-weight aggregation" method named Moss, which aggregates all weights within heterogeneous models to preserve comprehensive knowledge. Evaluation across various applications demonstrates that Moss significantly accelerates training, reduces on-device training time and energy consumption, enhances accuracy, and minimizes network bandwidth utilization when compared to state-of-the-art baselines.
Submitted: 13 March, 2025; originally announced March 2025.
Comments: Accepted by ACM IMWUT/Ubicomp 2025
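
A minimal sketch of what "full-weight aggregation" across models of different widths could look like, assuming a simple corner-aligned mapping of each client tensor into a proxy-sized tensor with per-entry averaging. Moss's actual proxy-model construction is more involved; the parameter names and shapes below are made up.

```python
import numpy as np

def full_weight_aggregate(client_weights: list[dict], proxy_shapes: dict) -> dict:
    """Aggregate heterogeneous client weights into a single proxy-sized model.

    Each client tensor is placed into the top-left corner of the proxy-shaped
    tensor, and each entry is averaged over the clients that actually cover it,
    so no client weight is discarded.
    """
    agg = {k: np.zeros(s) for k, s in proxy_shapes.items()}
    cnt = {k: np.zeros(s) for k, s in proxy_shapes.items()}
    for w in client_weights:
        for k, t in w.items():
            idx = tuple(slice(0, d) for d in t.shape)
            agg[k][idx] += t
            cnt[k][idx] += 1
    return {k: agg[k] / np.maximum(cnt[k], 1) for k in agg}

# Toy usage: two clients with hidden widths 8 and 12, proxy width 12.
rng = np.random.default_rng(0)
proxy = {"fc.weight": (12, 4)}
clients = [{"fc.weight": rng.normal(size=(8, 4))},
           {"fc.weight": rng.normal(size=(12, 4))}]
global_w = full_weight_aggregate(clients, proxy)
print(global_w["fc.weight"].shape)
```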

arXiv:2503.10135 (https://arxiv.org/abs/2503.10135) [cs.CL, cs.AI, cs.LG]
Title: Gumiho: A Hybrid Architecture to Prioritize Early Tokens in Speculative Decoding
Authors: Jinze Li, Yixing Xu, Haiduo Huang, Xuanwu Yin, Dong Li, Edith C. H. Ngai, Emad Barsoum
Abstract: Speculative decoding (SPD) aims to accelerate the auto-regressive token generation process of a target Large Language Model (LLM). Some approaches employ a draft model with multiple heads to predict a sequence of future tokens, where each head handles a token in the sequence. The target LLM verifies the predicted sequence and accepts aligned tokens, enabling efficient multi-token generation. However, existing methods assume that all tokens within a sequence are equally important, employing identical head structures and relying on a single-generation paradigm, either serial or parallel. To this end, we theoretically demonstrate that initial tokens in the draft sequence are more important than later ones. Building on this insight, we propose Gumiho, a hybrid model combining serial and parallel heads. Specifically, given the critical importance of early tokens, we employ a sophisticated Transformer architecture for the early draft heads in a serial configuration to improve accuracy. For later tokens, we utilize multiple lightweight MLP heads operating in parallel to enhance efficiency. By allocating more advanced model structures and longer running times to the early heads, Gumiho achieves improved overall performance. The experimental results demonstrate that our method outperforms existing approaches, fully validating its effectiveness.
Submitted: 13 March, 2025; originally announced March 2025.
Comments: Paper under review
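
The verification side of speculative decoding, which the draft heads in the entry above feed into, can be sketched as follows with toy stand-ins for both the target model and the draft heads; Gumiho's serial Transformer heads and parallel MLP heads are not modeled here, and all function names are illustrative.

```python
import numpy as np

VOCAB = 50

def toy_target_next(prefix: list[int]) -> int:
    """Deterministic stand-in for greedy decoding with the target LLM."""
    return (sum(prefix) * 2654435761) % VOCAB

def toy_draft_heads(prefix: list[int], k: int = 5) -> list[int]:
    """Stand-in for draft heads proposing k future tokens (imperfect on purpose)."""
    out, p = [], list(prefix)
    for _ in range(k):
        guess = toy_target_next(p)
        if np.random.rand() < 0.25:              # draft heads sometimes miss
            guess = (guess + 1) % VOCAB
        out.append(guess)
        p.append(guess)
    return out

def speculative_step(prefix: list[int]) -> tuple[list[int], int]:
    """Verify draft tokens against the target model; accept the matching prefix."""
    draft = toy_draft_heads(prefix)
    accepted = []
    for tok in draft:
        if tok == toy_target_next(prefix + accepted):
            accepted.append(tok)
        else:
            break
    # One extra token always comes from the target model itself.
    accepted.append(toy_target_next(prefix + accepted))
    return prefix + accepted, len(accepted)

np.random.seed(0)
seq, produced = [1, 2, 3], 0
for _ in range(20):
    seq, n = speculative_step(seq)
    produced += n
print("tokens generated:", produced, "in 20 target-verification rounds")
```

The accepted prefix length is driven by how accurate the earliest draft tokens are, which is consistent with the paper's motivation for giving those positions the heavier serial heads.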
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09958">arXiv:2503.09958</a> <span> [<a href="https://arxiv.org/pdf/2503.09958">pdf</a>, <a href="https://arxiv.org/format/2503.09958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Take Off the Training Wheels Progressive In-Context Learning for Effective Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhenyu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongfang Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xinshuo Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinping Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yibin Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+B">Baotian Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09958v1-abstract-short" style="display: inline;"> Recent studies have explored the working mechanisms of In-Context Learning (ICL). However, they mainly focus on classification and simple generation tasks, limiting their broader application to more complex generation tasks in practice. To address this gap, we investigate the impact of demonstrations on token representations within the practical alignment tasks. We find that the transformer embeds… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09958v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09958v1-abstract-full" style="display: none;"> Recent studies have explored the working mechanisms of In-Context Learning (ICL). However, they mainly focus on classification and simple generation tasks, limiting their broader application to more complex generation tasks in practice. To address this gap, we investigate the impact of demonstrations on token representations within the practical alignment tasks. We find that the transformer embeds the task function learned from demonstrations into the separator token representation, which plays an important role in the generation of prior response tokens. Once the prior response tokens are determined, the demonstrations become redundant.Motivated by this finding, we propose an efficient Progressive In-Context Alignment (PICA) method consisting of two stages. In the first few-shot stage, the model generates several prior response tokens via standard ICL while concurrently extracting the ICL vector that stores the task function from the separator token representation. In the following zero-shot stage, this ICL vector guides the model to generate responses without further demonstrations.Extensive experiments demonstrate that our PICA not only surpasses vanilla ICL but also achieves comparable performance to other alignment tuning methods. 
The proposed training-free method reduces the time cost (e.g., 5.45+) with improved alignment performance (e.g., 6.57+). Consequently, our work highlights the application of ICL for alignment and calls for a deeper understanding of ICL for complex generations. The code will be available at https://github.com/HITsz-TMG/PICA.
Submitted: 12 March, 2025; originally announced March 2025.
Comments: 15 pages, 9 figures, published in EMNLP 2024

arXiv:2503.09657 (https://arxiv.org/abs/2503.09657) [cs.LG]
Title: Týr-the-Pruner: Unlocking Accurate 50% Structural Pruning for LLMs via Global Sparsity Distribution Optimization
Authors: Guanchen Li, Yixing Xu, Zeping Li, Ji Liu, Xuanwu Yin, Dong Li, Emad Barsoum
Abstract: Structural pruning enhances hardware-agnostic inference efficiency for large language models (LLMs) but often struggles to maintain performance. Local pruning performs efficient layer-by-layer compression but ignores global topology. Global pruning has the potential to find the optimal solution, although it is resource-intensive. However, existing methods tend to rank structural saliency uniformly, ignoring inter-structure dependencies and failing to achieve end-to-end optimization. To address these limitations, we propose Týr-the-Pruner, an efficient end-to-end search-based global structural pruning framework. This framework constructs a supernet by repeatedly applying local pruning across a range of sparsity ratios to each layer in an LLM, with the core goal of determining the optimal sparsity distribution under a target overall sparsity ratio. Concretely, we introduce an effective local pruning and an expectation error accumulation approach to improve supernet construction. Furthermore, we employ an iterative prune-and-search strategy with coarse-to-fine sparsity granularity to ensure efficient search convergence. Experimental results show that Týr-the-Pruner achieves state-of-the-art structural pruning, retaining 97% of the dense model's performance while removing a challenging 50% of Llama-3.1-70B's parameters.
Submitted: 17 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.
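
One way to picture a "sparsity distribution search" like the one described above is a greedy allocation over a per-layer error table: keep raising the sparsity of whichever layer costs the least extra error per pruned parameter until the global target is hit. The sketch below does exactly that on synthetic numbers; it is a simplified stand-in, not the paper's supernet-based prune-and-search.

```python
import numpy as np

def search_sparsity_distribution(err, layer_sizes, levels, target=0.5):
    """Greedy per-layer sparsity assignment hitting a global sparsity target.

    err[l, i]   : estimated accuracy loss if layer l is pruned to levels[i]
                  (synthetic here; the paper estimates this with local pruning
                  and expectation error accumulation).
    layer_sizes : parameter count per layer.
    """
    L = len(layer_sizes)
    choice = np.zeros(L, dtype=int)                       # start fully dense
    total = float(np.sum(layer_sizes))

    def overall():
        return float(np.sum(levels[choice] * layer_sizes) / total)

    while overall() < target:
        best, best_l = None, None
        for l in range(L):
            if choice[l] + 1 >= len(levels):
                continue
            d_err = err[l, choice[l] + 1] - err[l, choice[l]]
            d_pruned = (levels[choice[l] + 1] - levels[choice[l]]) * layer_sizes[l]
            score = d_err / d_pruned                      # error paid per parameter pruned
            if best is None or score < best:
                best, best_l = score, l
        choice[best_l] += 1
    return levels[choice]

rng = np.random.default_rng(0)
levels = np.array([0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75])
sizes = rng.integers(1_000_000, 5_000_000, size=10)
errs = np.sort(rng.random((10, len(levels))), axis=1)     # error grows with sparsity
plan = search_sparsity_distribution(errs, sizes, levels, target=0.5)
print("per-layer sparsity:", plan, "overall:", np.sum(plan * sizes) / sizes.sum())
```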

arXiv:2503.09643 (https://arxiv.org/abs/2503.09643) [cs.LG, cs.DC]
Title: FedMSGL: A Self-Expressive Hypergraph Based Federated Multi-View Learning
Authors: Daoyuan Li, Zuyuan Yang, Shengli Xie
Abstract: Federated learning is essential for enabling collaborative model training across decentralized data sources while preserving data privacy and security. This approach mitigates the risks associated with centralized data collection and addresses concerns related to data ownership and compliance. Despite significant advancements in federated learning algorithms that address communication bottlenecks and enhance privacy protection, existing works overlook the impact of differences in data feature dimensions, resulting in global models that disproportionately depend on participants with large feature dimensions. Additionally, current single-view federated learning methods fail to account for the unique characteristics of multi-view data, leading to suboptimal performance in processing such data. To address these issues, we propose a Self-expressive Hypergraph Based Federated Multi-view Learning method (FedMSGL). The proposed method leverages the self-expressive property in local training to learn a uniform-dimension subspace with latent sample relations. On the central side, an adaptive fusion technique is employed to generate the global model, while a hypergraph is constructed from the learned global and view-specific subspaces to capture intricate interconnections across views. Experiments on multi-view datasets with different feature dimensions validated the effectiveness of the proposed method.
Submitted: 12 March, 2025; originally announced March 2025.
Comments: Accepted by AAAI 2025
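
The local "self-expressive" step mentioned above has a convenient ridge-regularized form with a closed-form solution, which the sketch below computes for one client view; the federated fusion and hypergraph construction on the server are not shown, and the data is synthetic.

```python
import numpy as np

def self_expressive_coeffs(X: np.ndarray, lam: float = 0.1) -> np.ndarray:
    """Closed-form ridge-regularized self-expressive coefficients.

    Solves min_C ||X - X C||_F^2 + lam ||C||_F^2, whose solution is
    C = (X^T X + lam I)^{-1} X^T X. Each column of C expresses one sample as a
    combination of the others, exposing latent sample relations.
    """
    n = X.shape[1]                     # X is (features, samples)
    G = X.T @ X
    return np.linalg.solve(G + lam * np.eye(n), G)

# Toy usage: one client view with 30-dim features and 100 samples.
rng = np.random.default_rng(0)
X = rng.normal(size=(30, 100))
C = self_expressive_coeffs(X, lam=0.5)
print("reconstruction error:", np.linalg.norm(X - X @ C) / np.linalg.norm(X))
```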
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accept by AAAI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09366">arXiv:2503.09366</a> <span> [<a href="https://arxiv.org/pdf/2503.09366">pdf</a>, <a href="https://arxiv.org/format/2503.09366">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Post-interactive Multimodal Trajectory Prediction for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Ziyi Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dushuai Li</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+Y">Yao Mu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Hongmao Qin</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+N">Nan Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09366v1-abstract-short" style="display: inline;"> Modeling the interactions among agents for trajectory prediction of autonomous driving has been challenging due to the inherent uncertainty in agents' behavior. The interactions involved in the predicted trajectories of agents, also called post-interactions, have rarely been considered in trajectory prediction models. To this end, we propose a coarse-to-fine Transformer for multimodal trajectory p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09366v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09366v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09366v1-abstract-full" style="display: none;"> Modeling the interactions among agents for trajectory prediction of autonomous driving has been challenging due to the inherent uncertainty in agents' behavior. The interactions involved in the predicted trajectories of agents, also called post-interactions, have rarely been considered in trajectory prediction models. To this end, we propose a coarse-to-fine Transformer for multimodal trajectory prediction, i.e., Pioformer, which explicitly extracts the post-interaction features to enhance the prediction accuracy. Specifically, we first build a Coarse Trajectory Network to generate coarse trajectories based on the observed trajectories and lane segments, in which the low-order interaction features are extracted with the graph neural networks. Next, we build a hypergraph neural network-based Trajectory Proposal Network to generate trajectory proposals, where the high-order interaction features are learned by the hypergraphs. Finally, the trajectory proposals are sent to the Proposal Refinement Network for further refinement. The observed trajectories and trajectory proposals are concatenated together as the inputs of the Proposal Refinement Network, in which the post-interaction features are learned by combining the previous interaction features and trajectory consistency features. 
Moreover, we propose a three-stage training scheme to facilitate the learning process. Extensive experiments on the Argoverse 1 dataset demonstrate the superiority of our method. Compared with the baseline HiVT-64, our model has reduced the prediction errors by 4.4%, 8.4%, 14.4%, and 5.7% regarding the metrics minADE6, minFDE6, MR6, and brier-minFDE6, respectively.
Submitted: 12 March, 2025; originally announced March 2025.

arXiv:2503.09091 (https://arxiv.org/abs/2503.09091) [cs.CV, cs.AI]
Title: Multi-Modal Foundation Models for Computational Pathology: A Survey
Authors: Dong Li, Guihong Wan, Xintao Wu, Xinyu Wu, Xiaohui Chen, Yi He, Christine G. Lian, Peter K. Sorger, Yevgeniy R. Semenov, Chen Zhao
Abstract: Foundation models have emerged as a powerful paradigm in computational pathology (CPath), enabling scalable and generalizable analysis of histopathological images.
While early developments centered on uni-modal models trained solely on visual data, recent advances have highlighted the promise of multi-modal foundation models that integrate heterogeneous data sources such as textual reports, structured domain knowledge, and molecular profiles. In this survey, we provide a comprehensive and up-to-date review of multi-modal foundation models in CPath, with a particular focus on models built upon hematoxylin and eosin (H&E) stained whole slide images (WSIs) and tile-level representations. We categorize 32 state-of-the-art multi-modal foundation models into three major paradigms: vision-language, vision-knowledge graph, and vision-gene expression. We further divide vision-language models into non-LLM-based and LLM-based approaches. Additionally, we analyze 28 available multi-modal datasets tailored for pathology, grouped into image-text pairs, instruction datasets, and image-other modality pairs. Our survey also presents a taxonomy of downstream tasks, highlights training and evaluation strategies, and identifies key challenges and future directions. We aim for this survey to serve as a valuable resource for researchers and practitioners working at the intersection of pathology and AI.
Submitted: 20 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.

arXiv:2503.08919 (https://arxiv.org/abs/2503.08919) [cs.CL, cs.AI]
Title: Backtracking for Safety
Authors: Bilgehan Sel, Dingcheng Li, Phillip Wallis, Vaishakh Keshava, Ming Jin, Siddhartha Reddy Jonnalagadda
Abstract: Large language models (LLMs) have demonstrated remarkable capabilities across various tasks, but ensuring their safety and alignment with human values remains crucial.
Current safety alignment methods, such as supervised fine-tuning and reinforcement learning-based approaches, can exhibit vulnerabilities to adversarial attacks and often result in shallow safety alignment, primarily focusing on preventing harmful content in the initial tokens of the generated output. While methods like resetting can help recover from unsafe generations by discarding previous tokens and restarting the generation process, they are not well-suited for addressing nuanced safety violations like toxicity that may arise within otherwise benign and lengthy generations. In this paper, we propose a novel backtracking method designed to address these limitations. Our method allows the model to revert to a safer generation state, not necessarily at the beginning, when safety violations occur during generation. This approach enables targeted correction of problematic segments without discarding the entire generated text, thereby preserving efficiency. We demonstrate that our method dramatically reduces toxicity appearing through the generation process with minimal impact on efficiency.
Submitted: 11 March, 2025; originally announced March 2025.
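
A toy version of the backtracking idea, assuming a mock sampler and a banned-word toxicity score in place of a real LLM and classifier: generate token by token, and when the safety score of the recent window crosses a threshold, revert a few tokens and steer away from the offending continuation. The paper's method decides where to revert and how to regenerate with the model itself; everything here is a stand-in.

```python
import random

TOXIC = {"insult", "slur"}              # stand-in lexicon; a real system would use a classifier

def toxicity(tokens):
    """Stand-in toxicity score over a sliding window of recent tokens."""
    window = tokens[-8:]
    return sum(t in TOXIC for t in window) / max(len(window), 1)

def sample_next(prefix, banned):
    """Mock LM step: mostly benign words, occasionally a problematic one."""
    vocab = {"the": 5, "car": 5, "turns": 5, "left": 5, "slowly": 5, "insult": 1, "slur": 1}
    words = [w for w in vocab if w not in banned]
    return random.choices(words, weights=[vocab[w] for w in words])[0]

def generate_with_backtracking(n_tokens=60, k=4, threshold=0.12):
    """Generate; on a safety violation, revert k tokens and resample."""
    out, banned = [], set()
    while len(out) < n_tokens:
        out.append(sample_next(out, banned))
        if toxicity(out) > threshold:
            bad = out[-1]
            out = out[: max(len(out) - k, 0)]   # revert to an earlier, safer state
            banned.add(bad)                      # discourage the same continuation
    return out

random.seed(0)
text = generate_with_backtracking()
print(" ".join(text))
print("final toxicity:", toxicity(text))
```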

arXiv:2503.08604 (https://arxiv.org/abs/2503.08604) [cs.RO, cs.AI]
Title: EMMOE: A Comprehensive Benchmark for Embodied Mobile Manipulation in Open Environments
Authors: Dongping Li, Tielong Cai, Tianci Tang, Wenhao Chai, Katherine Rose Driggs-Campbell, Gaoang Wang
Abstract: Developing autonomous home robots controlled by natural language has long been a pursuit of humans. While advancements in large language models (LLMs) and embodied intelligence bring this goal closer, several challenges persist: the lack of a unified benchmark for more complex robot tasks, limited evaluation methods and metrics, and data incompatibility between LLMs and mobile manipulation trajectories. To address these issues, we introduce Embodied Mobile Manipulation in Open Environments (EMMOE), which requires agents to interpret user instructions and execute long-horizon everyday tasks in continuous space. EMMOE seamlessly integrates high-level and low-level embodied tasks into a unified framework, along with three new metrics for more diverse assessment. Additionally, we collect EMMOE-100, which features various task attributes, detailed process annotations, re-plans after failures, and two sub-datasets for LLM training. Furthermore, we design HomieBot, a sophisticated agent system consisting of an LLM with Direct Preference Optimization (DPO), lightweight navigation and manipulation models, and multiple error detection mechanisms. Finally, we demonstrate HomieBot's performance and the evaluation of different models and policies.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08604v1-abstract-full').style.display = 'none'; document.getElementById('2503.08604v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08548">arXiv:2503.08548</a> <span> [<a href="https://arxiv.org/pdf/2503.08548">pdf</a>, <a href="https://arxiv.org/format/2503.08548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TLA: Tactile-Language-Action Model for Contact-Rich Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+P">Peng Hao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaofan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dingzhe Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaoge Cao</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiaoshuai Hao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+S">Shaowei Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08548v1-abstract-short" style="display: inline;"> Significant progress has been made in vision-language models. However, language-conditioned robotic manipulation for contact-rich tasks remains underexplored, particularly in terms of tactile sensing. To address this gap, we introduce the Tactile-Language-Action (TLA) model, which effectively processes sequential tactile feedback via cross-modal language grounding to enable robust policy generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08548v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08548v1-abstract-full" style="display: none;"> Significant progress has been made in vision-language models. However, language-conditioned robotic manipulation for contact-rich tasks remains underexplored, particularly in terms of tactile sensing. To address this gap, we introduce the Tactile-Language-Action (TLA) model, which effectively processes sequential tactile feedback via cross-modal language grounding to enable robust policy generation in contact-intensive scenarios. In addition, we construct a comprehensive dataset that contains 24k pairs of tactile action instruction data, customized for fingertip peg-in-hole assembly, providing essential resources for TLA training and evaluation. 
Our results show that TLA significantly outperforms traditional imitation learning methods (e.g., diffusion policy) in terms of effective action generation and action accuracy, while demonstrating strong generalization capabilities by achieving a success rate of over 85% on previously unseen assembly clearances and peg shapes. We publicly release all data and code in the hope of advancing research in language-conditioned tactile manipulation skill learning. Project website: https://sites.google.com/view/tactile-language-action/
Submitted: 11 March, 2025; originally announced March 2025.

arXiv:2503.08429 (https://arxiv.org/abs/2503.08429) [cs.CV]
Title: Using Powerful Prior Knowledge of Diffusion Model in Deep Unfolding Networks for Image Compressive Sensing
Authors: Chen Liao, Yan Shen, Dan Li, Zhongli Wang
Abstract: Recently, Deep Unfolding Networks (DUNs) have achieved impressive reconstruction quality in the field of image Compressive Sensing (CS) by unfolding iterative optimization algorithms into neural networks. The reconstruction quality of DUNs depends on the learned prior knowledge, so introducing stronger prior knowledge can further improve reconstruction quality. On the other hand, pre-trained diffusion models contain powerful prior knowledge and have a solid theoretical foundation and strong scalability, but they require a large number of iterative steps to achieve reconstruction. In this paper, we propose to use the powerful prior knowledge of a pre-trained diffusion model in DUNs to achieve high-quality reconstruction with fewer steps for image CS. Specifically, we first design an iterative optimization algorithm named Diffusion Message Passing (DMP), which embeds a pre-trained diffusion model into each iteration of DMP. Then, we deeply unfold the DMP algorithm into a neural network named DMP-DUN. The proposed DMP-DUN can use lightweight neural networks to map measurement data to the intermediate steps of the reverse diffusion process and directly approximate the divergence of the diffusion model, thereby further improving reconstruction efficiency. Extensive experiments show that our proposed DMP-DUN achieves state-of-the-art performance and requires as few as 2 steps to reconstruct the image. Codes are available at https://github.com/FengodChen/DMP-DUN-CVPR2025.
Submitted: 11 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025
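
The "gradient step plus denoiser" structure that DUNs unroll can be sketched in a few lines. Here a soft-threshold stands in for the pre-trained diffusion prior and the step sizes are hand-set rather than learned, so this is only the generic skeleton that DMP-DUN builds on, run on a synthetic sparse-recovery problem.

```python
import numpy as np

def soft_threshold(x, t):
    """Simple sparsity prior used here as a stand-in for the diffusion denoiser."""
    return np.sign(x) * np.maximum(np.abs(x) - t, 0.0)

def unfolded_recovery(A, y, n_iters=200, step=None, thresh=0.02):
    """Unrolled 'gradient step + denoise' iterations for compressive sensing.

    x_{k+1} = Denoise(x_k - step * A^T (A x_k - y))
    In DMP-DUN the denoiser is a pre-trained diffusion model embedded in each
    unfolded stage and the step sizes are learned; here both are hand-set.
    """
    if step is None:
        step = 1.0 / np.linalg.norm(A, 2) ** 2        # keep the gradient step contractive
    x = A.T @ y
    for _ in range(n_iters):
        grad = A.T @ (A @ x - y)
        x = soft_threshold(x - step * grad, thresh)
    return x

# Toy usage: recover a sparse 256-dim signal from 100 random measurements.
rng = np.random.default_rng(0)
n, m, k = 256, 100, 10
x_true = np.zeros(n)
x_true[rng.choice(n, k, replace=False)] = rng.normal(size=k)
A = rng.normal(size=(m, n)) / np.sqrt(m)
y = A @ x_true
x_hat = unfolded_recovery(A, y)
print("relative error:", np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true))
```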

arXiv:2503.08308 (https://arxiv.org/abs/2503.08308) [cs.AI]
Title: Seeing and Reasoning with Confidence: Supercharging Multimodal LLMs with an Uncertainty-Aware Agentic Framework
Authors: Zhuo Zhi, Chen Feng, Adam Daneshmend, Mine Orlu, Andreas Demosthenous, Lu Yin, Da Li, Ziquan Liu, Miguel R. D. Rodrigues
Abstract: Multimodal large language models (MLLMs) show promise in tasks like visual question answering (VQA) but still face challenges in multimodal reasoning. Recent works adapt agentic frameworks or chain-of-thought (CoT) reasoning to improve performance.
arXiv:2503.08308 [pdf, other] cs.AI
Seeing and Reasoning with Confidence: Supercharging Multimodal LLMs with an Uncertainty-Aware Agentic Framework
Authors: Zhuo Zhi, Chen Feng, Adam Daneshmend, Mine Orlu, Andreas Demosthenous, Lu Yin, Da Li, Ziquan Liu, Miguel R. D. Rodrigues
Abstract: Multimodal large language models (MLLMs) show promise in tasks like visual question answering (VQA) but still face challenges in multimodal reasoning. Recent works adapt agentic frameworks or chain-of-thought (CoT) reasoning to improve performance. However, CoT-based multimodal reasoning often demands costly data annotation and fine-tuning, while agentic approaches relying on external tools risk introducing unreliable output from these tools. In this paper, we propose Seeing and Reasoning with Confidence (SRICE), a training-free multimodal reasoning framework that integrates external vision models with uncertainty quantification (UQ) into an MLLM to address these challenges. Specifically, SRICE guides the inference process by allowing the MLLM to autonomously select regions of interest through multi-stage interactions with the help of external tools. We propose a conformal prediction-based approach to calibrate the output of external tools and to select the optimal tool by estimating the uncertainty of the MLLM's output. Our experiments show that the average improvement of SRICE over the base MLLM is 4.6% on five datasets, and on some datasets it even outperforms fine-tuning-based methods, revealing the significance of ensuring reliable tool use in an MLLM agent.
Submitted 11 March, 2025; originally announced March 2025.
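The core mechanism here is calibrating external tool outputs with conformal prediction and preferring the tool whose calibrated prediction set is smallest. The snippet below is a generic split-conformal sketch under assumed nonconformity scores, not the SRICE pipeline; the tool names and score distributions are illustrative.

import numpy as np

def conformal_threshold(cal_scores: np.ndarray, alpha: float = 0.1) -> float:
    # Split conformal prediction: from nonconformity scores on a calibration set,
    # return the threshold giving roughly (1 - alpha) coverage.
    n = len(cal_scores)
    q = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
    return float(np.quantile(cal_scores, q, method="higher"))

def select_tool(test_scores: dict, thresholds: dict) -> str:
    # Prefer the tool whose prediction set (candidates under its threshold) is smallest,
    # i.e. whose output is least uncertain.
    set_sizes = {tool: int((scores <= thresholds[tool]).sum()) for tool, scores in test_scores.items()}
    return min(set_sizes, key=set_sizes.get)

# Toy usage with two hypothetical tools and random scores.
rng = np.random.default_rng(0)
thresholds = {
    "detector": conformal_threshold(rng.uniform(0, 1, 500)),
    "segmenter": conformal_threshold(rng.uniform(0, 1, 500)),
}
test_scores = {"detector": rng.uniform(0, 1, 10), "segmenter": rng.uniform(0, 1, 10)}
print(select_tool(test_scores, thresholds))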
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08207">arXiv:2503.08207</a> <span> [<a href="https://arxiv.org/pdf/2503.08207">pdf</a>, <a href="https://arxiv.org/format/2503.08207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> To Use or Not to Use a Universal Force Field </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Denan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiangkai Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lintao Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08207v1-abstract-short" style="display: inline;"> Artificial intelligence (AI) is revolutionizing scientific research, particularly in computational materials science, by enabling more accurate and efficient simulations. Machine learning force fields (MLFFs) have emerged as powerful tools for molecular dynamics (MD) simulations, potentially offering quantum-mechanical accuracy with the efficiency of classical MD. This Perspective evaluates the vi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08207v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08207v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08207v1-abstract-full" style="display: none;"> Artificial intelligence (AI) is revolutionizing scientific research, particularly in computational materials science, by enabling more accurate and efficient simulations. Machine learning force fields (MLFFs) have emerged as powerful tools for molecular dynamics (MD) simulations, potentially offering quantum-mechanical accuracy with the efficiency of classical MD. This Perspective evaluates the viability of universal MLFFs for simulating complex materials systems from the standpoint of a potential practitioner. Using the temperature-driven ferroelectric-paraelectric phase transition of PbTiO$_3$ as a benchmark, we assess leading universal force fields, including CHGNet, MACE, M3GNet, and GPTFF, alongside specialized models like UniPero. While universal MLFFs trained on PBE-derived datasets perform well in predicting equilibrium properties, they largely fail to capture realistic finite-temperature phase transitions under constant-pressure MD, often exhibiting unphysical instabilities. These shortcomings stem from inherited biases in exchange-correlation functionals and limited generalization to anharmonic interactions governing dynamic behavior. However, fine-tuning universal models or employing system-specific MLFFs like UniPero successfully restores predictive accuracy. 
We advocates for hybrid approaches combining universal pretraining with targeted optimization, improved error quantification frameworks, and community-driven benchmarks to advance MLFFs as robust tools for computational materials discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08207v1-abstract-full').style.display = 'none'; document.getElementById('2503.08207v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08170">arXiv:2503.08170</a> <span> [<a href="https://arxiv.org/pdf/2503.08170">pdf</a>, <a href="https://arxiv.org/format/2503.08170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CQVPR: Landmark-aware Contextual Queries for Visual Place Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongyue Li</a>, <a href="/search/cs?searchtype=author&query=Deguchi%2C+D">Daisuke Deguchi</a>, <a href="/search/cs?searchtype=author&query=Murase%2C+H">Hiroshi Murase</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08170v1-abstract-short" style="display: inline;"> Visual Place Recognition (VPR) aims to estimate the location of the given query image within a database of geo-tagged images. To identify the exact location in an image, detecting landmarks is crucial. However, in some scenarios, such as urban environments, there are numerous landmarks, such as various modern buildings, and the landmarks in different cities often exhibit high visual similarity. Th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08170v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08170v1-abstract-full" style="display: none;"> Visual Place Recognition (VPR) aims to estimate the location of the given query image within a database of geo-tagged images. To identify the exact location in an image, detecting landmarks is crucial. However, in some scenarios, such as urban environments, there are numerous landmarks, such as various modern buildings, and the landmarks in different cities often exhibit high visual similarity. Therefore, it is essential not only to leverage the landmarks but also to consider the contextual information surrounding them, such as whether there are trees, roads, or other features around the landmarks. We propose the Contextual Query VPR (CQVPR), which integrates contextual information with detailed pixel-level visual features. 
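The benchmark described here amounts to running finite-temperature MD with a candidate force field and checking whether the phase behavior stays physical. Below is a minimal ASE-style sketch of such a run; it uses ASE's toy EMT calculator on a copper cell purely as a runnable stand-in (the universal MLFFs named in the abstract ship their own ASE calculators, and the paper's actual system is PbTiO$_3$), so the structure, temperature, and calculator are assumptions for illustration.

from ase import units
from ase.build import bulk
from ase.calculators.emt import EMT                      # toy stand-in for a universal MLFF calculator
from ase.md.langevin import Langevin
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution

atoms = bulk("Cu", "fcc", a=3.6) * (3, 3, 3)             # placeholder supercell
atoms.calc = EMT()

MaxwellBoltzmannDistribution(atoms, temperature_K=600)
dyn = Langevin(atoms, timestep=2 * units.fs, temperature_K=600, friction=0.02)

def report():
    epot = atoms.get_potential_energy() / len(atoms)
    print(f"E_pot = {epot:.3f} eV/atom")

dyn.attach(report, interval=100)                         # log every 100 MD steps
dyn.run(1000)                                            # in practice: sweep T and track the order parameter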
arXiv:2503.08170 [pdf, other] cs.CV
CQVPR: Landmark-aware Contextual Queries for Visual Place Recognition
Authors: Dongyue Li, Daisuke Deguchi, Hiroshi Murase
Abstract: Visual Place Recognition (VPR) aims to estimate the location of the given query image within a database of geo-tagged images. To identify the exact location in an image, detecting landmarks is crucial. However, in some scenarios, such as urban environments, there are numerous landmarks, such as various modern buildings, and the landmarks in different cities often exhibit high visual similarity. Therefore, it is essential not only to leverage the landmarks but also to consider the contextual information surrounding them, such as whether there are trees, roads, or other features around the landmarks. We propose the Contextual Query VPR (CQVPR), which integrates contextual information with detailed pixel-level visual features. By leveraging a set of learnable contextual queries, our method automatically learns the high-level contexts with respect to landmarks and their surrounding areas. Heatmaps depicting regions that each query attends to serve as context-aware features, offering cues that could enhance the understanding of each scene. We further propose a query matching loss to supervise the extraction process of contextual queries. Extensive experiments on several datasets demonstrate that the proposed method outperforms other state-of-the-art methods, especially in challenging scenarios.
Submitted 11 March, 2025; originally announced March 2025.
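The "learnable contextual queries" idea is essentially a small set of query vectors that cross-attend to a dense feature map, with the attention maps doubling as heatmaps. The PyTorch sketch below illustrates that pattern under assumed shapes; it is not the CQVPR architecture and omits its query matching loss.

import torch
import torch.nn as nn

class ContextualQueries(nn.Module):
    def __init__(self, num_queries: int = 8, dim: int = 256):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(num_queries, dim))          # learnable context queries
        self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)

    def forward(self, feat_map: torch.Tensor):
        # feat_map: (B, C, H, W) pixel-level features from a backbone
        B, C, H, W = feat_map.shape
        tokens = feat_map.flatten(2).transpose(1, 2)                         # (B, H*W, C)
        q = self.queries.unsqueeze(0).expand(B, -1, -1)                      # (B, Q, C)
        context, attn_w = self.attn(q, tokens, tokens)                       # attended context per query
        heatmaps = attn_w.reshape(B, -1, H, W)                               # (B, Q, H, W) attention heatmaps
        return context, heatmaps

feats = torch.randn(2, 256, 16, 16)
context, heat = ContextualQueries()(feats)
print(context.shape, heat.shape)                                             # (2, 8, 256) and (2, 8, 16, 16)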
arXiv:2503.07667 [pdf, other] cs.LG cs.AI cs.CV eess.SP
CLIMB: Data Foundations for Large Scale Multimodal Clinical Foundation Models
Authors: Wei Dai, Peilin Chen, Malinda Lu, Daniel Li, Haowen Wei, Hejie Cui, Paul Pu Liang
Abstract: Recent advances in clinical AI have enabled remarkable progress across many clinical domains. However, existing benchmarks and models are primarily limited to a small set of modalities and tasks, which hinders the development of large-scale multimodal methods that can make holistic assessments of patient health and well-being. To bridge this gap, we introduce the Clinical Large-Scale Integrative Multimodal Benchmark (CLIMB), a comprehensive clinical benchmark unifying diverse clinical data across imaging, language, temporal, and graph modalities. CLIMB comprises 4.51 million patient samples totaling 19.01 terabytes distributed across 2D imaging, 3D video, time series, graphs, and multimodal data. Through extensive empirical evaluation, we demonstrate that multitask pretraining significantly improves performance on understudied domains, achieving up to 29% improvement in ultrasound and 23% in ECG analysis over single-task learning. Pretraining on CLIMB also effectively improves models' generalization capability to new tasks, and strong unimodal encoder performance translates well to multimodal performance when paired with task-appropriate fusion strategies. Our findings provide a foundation for new architecture designs and pretraining strategies to advance clinical AI research. Code is released at https://github.com/DDVD233/climb.
Submitted 20 March, 2025; v1 submitted 8 March, 2025; originally announced March 2025.
arXiv:2503.07523 [pdf, other] cs.CV
VisRL: Intention-Driven Visual Perception via Reinforced Reasoning
Authors: Zhangquan Chen, Xufang Luo, Dongsheng Li
Abstract: Visual understanding is inherently intention-driven: humans selectively focus on different regions of a scene based on their goals. Recent advances in large multimodal models (LMMs) enable flexible expression of such intentions through natural language, allowing queries to guide visual reasoning processes. Frameworks like Visual Chain-of-Thought have demonstrated the benefit of incorporating explicit reasoning steps, where the model predicts a focus region before answering a query. However, existing approaches rely heavily on supervised training with annotated intermediate bounding boxes, which severely limits scalability due to the combinatorial explosion of intention-region pairs. To overcome this limitation, we propose VisRL, the first framework that applies reinforcement learning (RL) to the problem of intention-driven visual perception. VisRL optimizes the entire visual reasoning process using only reward signals. By treating intermediate focus selection as an internal decision optimized through trial and error, our method eliminates the need for costly region annotations while aligning more closely with how humans learn to perceive the world. Extensive experiments across multiple benchmarks show that VisRL consistently outperforms strong baselines, demonstrating both its effectiveness and its strong generalization across different LMMs. Our code is available at https://github.com/zhangquanchen/VisRL.
Submitted 10 March, 2025; originally announced March 2025.
Comments: 18 pages, 11 figures
ACM Class: I.2.10
arXiv:2503.07331 [pdf, other] quant-ph cs.ET
TANGO: A Robust Qubit Mapping Algorithm via Two-Stage Search and Bidirectional Look
Authors: Kang Xu, Yukun Wang, Dandan Li
Abstract: Current quantum devices typically lack full qubit connectivity, making it difficult to directly execute logical circuits on quantum devices. This limitation necessitates quantum circuit mapping algorithms to insert SWAP gates, dynamically remapping logical qubits to physical qubits and transforming logical circuits into physical circuits that comply with device connectivity constraints. However, the insertion of SWAP gates increases both the gate count and circuit depth, ultimately reducing the fidelity of quantum algorithms. To achieve a balanced optimization of these two objectives, we propose the TANGO algorithm. By incorporating a layer-weight allocation strategy, the algorithm first formulates an evaluation function that balances the impact of qubit mapping on both mapped and unmapped nodes, thereby enhancing the quality of the initial mapping. Next, we design an innovative two-stage routing algorithm that prioritizes the number of executable gates as the primary evaluation metric while also considering quantum gate distance, circuit depth, and a novel bidirectional-look SWAP strategy, which optimizes SWAP gate selection in conjunction with preceding gates, improving the effectiveness of the mapping algorithm. Finally, by integrating advanced quantum gate optimization techniques, the algorithm's overall performance is further enhanced. Experimental results demonstrate that, compared to state-of-the-art methods, the proposed algorithm achieves multi-objective co-optimization of gate count and circuit depth across various benchmarks and quantum devices, exhibiting significant performance advantages.
Submitted 10 March, 2025; originally announced March 2025.
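SWAP-insertion routing of the kind described here is driven by an evaluation function over candidate SWAPs. The snippet below shows the simplest version of that idea: score each SWAP on a device edge by how much it shrinks the total coupling distance of the pending two-qubit gates. It is a generic routing heuristic for illustration, not TANGO's two-stage, bidirectional-look search.

from collections import deque

def all_distances(coupling):
    # BFS shortest-path distances between every pair of physical qubits.
    dist = {}
    for src in coupling:
        seen, frontier = {src}, deque([(src, 0)])
        while frontier:
            node, k = frontier.popleft()
            dist[(src, node)] = k
            for nb in coupling[node]:
                if nb not in seen:
                    seen.add(nb)
                    frontier.append((nb, k + 1))
    return dist

def best_swap(layout, pending, coupling, dist):
    # layout: logical -> physical qubit; pending: list of (logical, logical) two-qubit gates.
    def cost(lay):
        return sum(dist[(lay[a], lay[b])] for a, b in pending)
    best, best_cost = None, cost(layout)
    for p in coupling:
        for q in coupling[p]:
            lay = dict(layout)
            inv = {phys: log for log, phys in lay.items()}
            la, lb = inv.get(p), inv.get(q)
            if la is not None: lay[la] = q
            if lb is not None: lay[lb] = p
            c = cost(lay)
            if c < best_cost:
                best, best_cost = (p, q), c
    return best

coupling = {0: [1], 1: [0, 2], 2: [1, 3], 3: [2]}      # linear 4-qubit device
dist = all_distances(coupling)
layout = {0: 0, 1: 3, 2: 1, 3: 2}                       # logical -> physical
print(best_swap(layout, [(0, 1)], coupling, dist))      # SWAP on edge (0, 1) brings the gate's qubits closer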
arXiv:2503.07215 [pdf, other] cs.SE cs.PL
Control Flow-Augmented Decompiler based on Large Language Model
Authors: Peipei Liu, Jian Sun, Li Chen, Zhaoteng Yan, Peizheng Zhang, Dapeng Sun, Dawei Wang, Dan Li
Abstract: Binary decompilation plays a crucial role in various tasks related to security threat analysis and software engineering, such as binary vulnerability detection and software supply chain analysis. Current prevalent binary decompilation methods primarily rely on large language models (LLMs) and can be broadly classified into two main approaches: prompt-based decompilation and end-to-end decompilation. Prompt-based methods typically require significant effort to analyze and summarize the predicted data to extract aspect-specific expert knowledge, which is then fed into a general-purpose large language model to address specific decompilation tasks. End-to-end methods, on the other hand, carefully construct training datasets or neural networks to perform post-training on general-purpose large language models, thereby obtaining domain-specific large language models for decompiling the predicted data. However, both existing approaches still face significant challenges, including the absence of rich semantic representations of the input code and the neglect of control flow information, which is crucial for accurate decompilation. Furthermore, most current decompilation techniques are specifically tailored for the x86 architecture, making it difficult to efficiently adapt and generalize them to other bit widths or instruction architectures. To address these limitations, we propose a novel end-to-end decompilation LLM, CFADecLLM, which aims to enhance existing end-to-end decompilation methods. We conduct extensive experiments on the public HumanEval and ExeBench datasets across four optimization levels, and the results demonstrate that our approach outperforms existing methods on multiple metrics, validating its effectiveness and superiority.
Submitted 10 March, 2025; originally announced March 2025.
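The abstract's central complaint is that control-flow information gets dropped on the way into the model. A minimal way to see what a control flow-augmented input could look like is to serialize the basic blocks together with the CFG edges, as sketched below; the prompt format, labels, and instructions are illustrative assumptions, not CFADecLLM's actual input encoding.

def build_prompt(asm_blocks, cfg_edges):
    # asm_blocks: {block_label: [instructions]}; cfg_edges: [(src_label, dst_label)]
    lines = ["Decompile the following function to C.", "", "Basic blocks:"]
    for label, insns in asm_blocks.items():
        lines.append(f"{label}:")
        lines.extend(f"  {insn}" for insn in insns)
    lines += ["", "Control-flow edges:"]
    lines.extend(f"  {src} -> {dst}" for src, dst in cfg_edges)
    return "\n".join(lines)

asm = {
    "entry": ["cmp edi, 0", "jle .done"],
    ".loop": ["add eax, edi", "dec edi", "jnz .loop"],
    ".done": ["ret"],
}
print(build_prompt(asm, [("entry", ".loop"), ("entry", ".done"), (".loop", ".done")]))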
arXiv:2503.06955 [pdf, other] cs.CV
Motion Anything: Any to Motion Generation
Authors: Zeyu Zhang, Yiran Wang, Wei Mao, Danning Li, Rui Zhao, Biao Wu, Zirui Song, Bohan Zhuang, Ian Reid, Richard Hartley
Abstract: Conditional motion generation has been extensively studied in computer vision, yet two critical challenges remain. First, while masked autoregressive methods have recently outperformed diffusion-based approaches, existing masking models lack a mechanism to prioritize dynamic frames and body parts based on given conditions. Second, existing methods for different conditioning modalities often fail to integrate multiple modalities effectively, limiting control and coherence in generated motion. To address these challenges, we propose Motion Anything, a multimodal motion generation framework that introduces an Attention-based Mask Modeling approach, enabling fine-grained spatial and temporal control over key frames and actions. Our model adaptively encodes multimodal conditions, including text and music, improving controllability. Additionally, we introduce Text-Music-Dance (TMD), a new motion dataset consisting of 2,153 pairs of text, music, and dance, making it twice the size of AIST++, thereby filling a critical gap in the community. Extensive experiments demonstrate that Motion Anything surpasses state-of-the-art methods across multiple benchmarks, achieving a 15% improvement in FID on HumanML3D and showing consistent performance gains on AIST++ and TMD. See our project website: https://steve-zeyu-zhang.github.io/MotionAnything
Submitted 11 March, 2025; v1 submitted 10 March, 2025; originally announced March 2025.
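"Prioritizing dynamic frames" can be illustrated with a very small heuristic: estimate per-frame motion magnitude from the joint trajectory and mask high-motion frames with higher probability, so training focuses on reconstructing the dynamic parts. This is a generic sketch of condition-aware masking, not Motion Anything's Attention-based Mask Modeling; the shapes and mask ratio are assumptions.

import torch

def dynamic_frame_mask(joints: torch.Tensor, mask_ratio: float = 0.4) -> torch.Tensor:
    # joints: (T, J, 3) joint positions over T frames
    speed = joints.diff(dim=0).norm(dim=-1).mean(dim=-1)     # (T-1,) mean joint speed per frame
    speed = torch.cat([speed[:1], speed])                    # pad back to length T
    probs = speed / speed.sum()                              # sample frames in proportion to motion
    num_mask = int(mask_ratio * joints.shape[0])
    idx = torch.multinomial(probs, num_mask, replacement=False)
    mask = torch.zeros(joints.shape[0], dtype=torch.bool)
    mask[idx] = True
    return mask                                              # True = frame is masked for reconstruction

seq = torch.randn(60, 22, 3)                                 # 60 frames, 22 joints
print(dynamic_frame_mask(seq).sum())                         # roughly 24 masked frames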
arXiv:2503.06885 [pdf, other] cs.CV
ProBench: Judging Multimodal Foundation Models on Open-ended Multi-domain Expert Tasks
Authors: Yan Yang, Dongxu Li, Haoning Wu, Bei Chen, Liu Liu, Liyuan Pan, Junnan Li
Abstract: Solving expert-level multimodal tasks is a key milestone towards general intelligence. As the capabilities of multimodal large language models (MLLMs) continue to improve, evaluation of such advanced multimodal intelligence becomes necessary yet challenging. In this work, we introduce ProBench, a benchmark of open-ended user queries that require professional expertise and advanced reasoning. ProBench consists of 4,000 high-quality samples independently submitted by professionals based on their daily productivity demands. It spans 10 fields and 56 sub-fields, including science, arts, humanities, coding, mathematics, and creative writing. Experimentally, we evaluate and compare 24 of the latest models using MLLM-as-a-Judge. Our results reveal that although the best open-source models rival the proprietary ones, ProBench presents significant challenges in visual perception, textual understanding, domain knowledge, and advanced reasoning, thus providing valuable directions for future multimodal AI research efforts.
Submitted 9 March, 2025; originally announced March 2025.
arXiv:2503.06247 [pdf, other] cs.SD cs.AI
Infant Cry Detection Using Causal Temporal Representation
Authors: Minghao Fu, Danning Li, Aryan Gadhiya, Benjamin Lambright, Mohamed Alowais, Mohab Bahnassy, Saad El Dine Elletter, Hawau Olamide Toyin, Haiyan Jiang, Kun Zhang, Hanan Aldarmaki
Abstract: This paper addresses a major challenge in acoustic event detection, in particular infant cry detection in the presence of other sounds and background noises: the lack of precisely annotated data. We present two contributions for supervised and unsupervised infant cry detection. The first is an annotated dataset for cry segmentation, which enables supervised models to achieve state-of-the-art performance. Additionally, we propose a novel unsupervised method, Causal Representation Spare Transition Clustering (CRSTC), based on causal temporal representation, which helps address the issue of data scarcity more generally. By integrating the detected cry segments, we significantly improve the performance of downstream infant cry classification, highlighting the potential of this approach for infant care applications.
Submitted 8 March, 2025; originally announced March 2025.
Comments: Accepted to ICASSP 2025
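For orientation, the simplest unsupervised segment-then-classify pipeline looks like the sketch below: embed short audio frames (here with MFCCs), cluster the frames, and treat contiguous runs of one cluster as candidate cry segments. This is a naive baseline for illustration only; the paper's CRSTC method builds on causal temporal representations, and the audio path here is a placeholder.

import librosa
from sklearn.cluster import KMeans

y, sr = librosa.load("recording.wav", sr=16000)                 # placeholder audio file
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T            # (frames, 13) frame-level features
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(mfcc)

# Collapse per-frame cluster labels into (start_frame, end_frame, cluster) segments.
segments, start = [], 0
for i in range(1, len(labels) + 1):
    if i == len(labels) or labels[i] != labels[start]:
        segments.append((start, i, int(labels[start])))
        start = i
print(segments[:5])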
arXiv:2503.06084 [pdf, other] cs.CV
Exploring Interpretability for Visual Prompt Tuning with Hierarchical Concepts
Authors: Yubin Wang, Xinyang Jiang, De Cheng, Xiangqian Zhao, Zilong Wang, Dongsheng Li, Cairong Zhao
Abstract: Visual prompt tuning offers significant advantages for adapting pre-trained visual foundation models to specific tasks. However, current research provides limited insight into the interpretability of this approach, which is essential for enhancing AI reliability and enabling AI-driven knowledge discovery. In this paper, rather than learning abstract prompt embeddings, we propose Interpretable Visual Prompt Tuning (IVPT), the first framework to explore interpretability for visual prompts by introducing hierarchical concept prototypes. Specifically, visual prompts are linked to human-understandable semantic concepts, represented as a set of category-agnostic prototypes, each corresponding to a specific region of the image. IVPT then aggregates features from these regions to generate interpretable prompts, which are structured hierarchically to explain visual prompts at different granularities. Comprehensive qualitative and quantitative evaluations on fine-grained classification benchmarks show its superior interpretability and performance over conventional visual prompt tuning methods and existing interpretable methods.
Submitted 8 March, 2025; originally announced March 2025.
Comments: 10 pages, 9 figures
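The prototype mechanism described above can be pictured as a soft assignment of patch tokens to a small set of category-agnostic prototypes, with each prototype's region average becoming a prompt token and a second level pooling fine prototypes into coarser ones. The PyTorch sketch below is only that picture, with assumed shapes and a fixed two-level hierarchy; it is not the IVPT implementation.

import torch
import torch.nn as nn

class ConceptPrompts(nn.Module):
    def __init__(self, dim: int = 768, fine: int = 16, coarse: int = 4):
        super().__init__()
        self.protos = nn.Parameter(torch.randn(fine, dim))      # category-agnostic concept prototypes
        self.group = nn.Parameter(torch.randn(coarse, fine))    # learned fine-to-coarse grouping

    def forward(self, patches: torch.Tensor):
        # patches: (B, N, D) patch tokens from a frozen backbone
        B = patches.shape[0]
        d = torch.cdist(patches, self.protos.unsqueeze(0).expand(B, -1, -1))       # (B, N, fine)
        assign = torch.softmax(-d, dim=-1)                                          # soft region assignment
        fine_prompts = torch.einsum("bnk,bnd->bkd", assign, patches)                # sum of each region's patches
        fine_prompts = fine_prompts / assign.sum(1).unsqueeze(-1).clamp(min=1e-6)   # region averages
        coarse_prompts = torch.softmax(self.group, dim=-1) @ fine_prompts           # (B, coarse, D)
        return fine_prompts, coarse_prompts

x = torch.randn(2, 196, 768)                                  # e.g. ViT-B/16 patch tokens
fine, coarse = ConceptPrompts()(x)
print(fine.shape, coarse.shape)                               # (2, 16, 768) and (2, 4, 768)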