Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,106 results for author: <span class="mathjax">Xu, X</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xu%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Xu, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xu%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xu, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xu%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23804">arXiv:2503.23804</a> <span> [<a href="https://arxiv.org/pdf/2503.23804">pdf</a>, <a href="https://arxiv.org/format/2503.23804">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Get the Agents Drunk: Memory Perturbations in Autonomous Agent-based Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shiyi Yang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhibo Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiwei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Liming Zhu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+L">Lina Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23804v1-abstract-short" style="display: inline;"> Large language model-based agents are increasingly used in recommender systems (Agent4RSs) to achieve personalized behavior modeling. Specifically, Agent4RSs introduces memory mechanisms that enable the agents to autonomously learn and self-evolve from real-world interactions. However, to the best of our knowledge, how robust Agent4RSs are remains unexplored. As such, in this paper, we propose the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23804v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23804v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23804v1-abstract-full" style="display: none;"> Large language model-based agents are increasingly used in recommender systems (Agent4RSs) to achieve personalized behavior modeling. Specifically, Agent4RSs introduces memory mechanisms that enable the agents to autonomously learn and self-evolve from real-world interactions. However, to the best of our knowledge, how robust Agent4RSs are remains unexplored. As such, in this paper, we propose the first work to attack Agent4RSs by perturbing agents' memories, not only to uncover their limitations but also to enhance their security and robustness, ensuring the development of safer and more reliable AI agents. Given the security and privacy concerns, it is more practical to launch attacks under a black-box setting, where the accurate knowledge of the victim models cannot be easily obtained. Moreover, the practical attacks are often stealthy to maximize the impact. To this end, we propose a novel practical attack framework named DrunkAgent. DrunkAgent consists of a generation module, a strategy module, and a surrogate module. The generation module aims to produce effective and coherent adversarial textual triggers, which can be used to achieve attack objectives such as promoting the target items. The strategy module is designed to `get the target agents drunk' so that their memories cannot be effectively updated during the interaction process. As such, the triggers can play the best role. Both of the modules are optimized on the surrogate module to improve the transferability and imperceptibility of the attacks. By identifying and analyzing the vulnerabilities, our work provides critical insights that pave the way for building safer and more resilient Agent4RSs. Extensive experiments across various real-world datasets demonstrate the effectiveness of DrunkAgent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23804v1-abstract-full').style.display = 'none'; document.getElementById('2503.23804v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23776">arXiv:2503.23776</a> <span> [<a href="https://arxiv.org/pdf/2503.23776">pdf</a>, <a href="https://arxiv.org/format/2503.23776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> VIDEX: A Disaggregated and Extensible Virtual Index for the Cloud and AI Era </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+R">Rong Kang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tieying Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xianghong Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Linhui Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhimin Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+R">Rui Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianjun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23776v1-abstract-short" style="display: inline;"> Virtual index, also known as hypothetical indexes, play a crucial role in database query optimization. However, with the rapid advancement of cloud computing and AI-driven models for database optimization, traditional virtual index approaches face significant challenges. Cloud-native environments often prohibit direct conducting query optimization process on production databases due to stability r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23776v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23776v1-abstract-full" style="display: none;"> Virtual index, also known as hypothetical indexes, play a crucial role in database query optimization. However, with the rapid advancement of cloud computing and AI-driven models for database optimization, traditional virtual index approaches face significant challenges. Cloud-native environments often prohibit direct conducting query optimization process on production databases due to stability requirements and data privacy concerns. Moreover, while AI models show promising progress, their integration with database systems poses challenges in system complexity, inference acceleration, and model hot updates. In this paper, we present VIDEX, a three-layer disaggregated architecture that decouples database instances, the virtual index optimizer, and algorithm services, providing standardized interfaces for AI model integration. Users can configure VIDEX by either collecting production statistics or by loading from a prepared file; this setup allows for high-accurate what-if analyses based on virtual indexes, achieving query plans that are identical to those of the production instance. Additionally, users can freely integrate new AI-driven algorithms into VIDEX. VIDEX has been successfully deployed at ByteDance, serving thousands of MySQL instances daily and over millions of SQL queries for index optimization tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23776v1-abstract-full').style.display = 'none'; document.getElementById('2503.23776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23297">arXiv:2503.23297</a> <span> [<a href="https://arxiv.org/pdf/2503.23297">pdf</a>, <a href="https://arxiv.org/format/2503.23297">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ReasonGrounder: LVLM-Guided Hierarchical Feature Splatting for Open-Vocabulary 3D Visual Grounding and Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhenyang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yikai Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sixiao Zheng</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+T">Tongying Pan</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Longfei Liang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanwei Fu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangyang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23297v1-abstract-short" style="display: inline;"> Open-vocabulary 3D visual grounding and reasoning aim to localize objects in a scene based on implicit language descriptions, even when they are occluded. This ability is crucial for tasks such as vision-language navigation and autonomous robotics. However, current methods struggle because they rely heavily on fine-tuning with 3D annotations and mask proposals, which limits their ability to handle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23297v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23297v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23297v1-abstract-full" style="display: none;"> Open-vocabulary 3D visual grounding and reasoning aim to localize objects in a scene based on implicit language descriptions, even when they are occluded. This ability is crucial for tasks such as vision-language navigation and autonomous robotics. However, current methods struggle because they rely heavily on fine-tuning with 3D annotations and mask proposals, which limits their ability to handle diverse semantics and common knowledge required for effective reasoning. In this work, we propose ReasonGrounder, an LVLM-guided framework that uses hierarchical 3D feature Gaussian fields for adaptive grouping based on physical scale, enabling open-vocabulary 3D grounding and reasoning. ReasonGrounder interprets implicit instructions using large vision-language models (LVLM) and localizes occluded objects through 3D Gaussian splatting. By incorporating 2D segmentation masks from the SAM and multi-view CLIP embeddings, ReasonGrounder selects Gaussian groups based on object scale, enabling accurate localization through both explicit and implicit language understanding, even in novel, occluded views. We also contribute ReasoningGD, a new dataset containing over 10K scenes and 2 million annotations for evaluating open-vocabulary 3D grounding and amodal perception under occlusion. Experiments show that ReasonGrounder significantly improves 3D grounding accuracy in real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23297v1-abstract-full').style.display = 'none'; document.getElementById('2503.23297v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23179">arXiv:2503.23179</a> <span> [<a href="https://arxiv.org/pdf/2503.23179">pdf</a>, <a href="https://arxiv.org/format/2503.23179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OncoReg: Medical Image Registration for Oncological Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Heyer%2C+W">Wiebke Heyer</a>, <a href="/search/cs?searchtype=author&query=Elser%2C+Y">Yannic Elser</a>, <a href="/search/cs?searchtype=author&query=Berkel%2C+L">Lennart Berkel</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xinrui Song</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuanang Xu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Pingkun Yan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xi Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zi Li</a>, <a href="/search/cs?searchtype=author&query=Mok%2C+T+C+W">Tony C. W. Mok</a>, <a href="/search/cs?searchtype=author&query=LI%2C+B">BoWen LI</a>, <a href="/search/cs?searchtype=author&query=Staackmann%2C+C">Christian Staackmann</a>, <a href="/search/cs?searchtype=author&query=Gro%C3%9Fbr%C3%B6hmer%2C+C">Christoph Gro脽br枚hmer</a>, <a href="/search/cs?searchtype=author&query=Hering%2C+A">Alessa Hering</a>, <a href="/search/cs?searchtype=author&query=Sieren%2C+M+M">Malte M. Sieren</a>, <a href="/search/cs?searchtype=author&query=Heinrich%2C+M+P">Mattias P. Heinrich</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23179v1-abstract-short" style="display: inline;"> In modern cancer research, the vast volume of medical data generated is often underutilised due to challenges related to patient privacy. The OncoReg Challenge addresses this issue by enabling researchers to develop and validate image registration methods through a two-phase framework that ensures patient privacy while fostering the development of more generalisable AI models. Phase one involves w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23179v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23179v1-abstract-full" style="display: none;"> In modern cancer research, the vast volume of medical data generated is often underutilised due to challenges related to patient privacy. The OncoReg Challenge addresses this issue by enabling researchers to develop and validate image registration methods through a two-phase framework that ensures patient privacy while fostering the development of more generalisable AI models. Phase one involves working with a publicly available dataset, while phase two focuses on training models on a private dataset within secure hospital networks. OncoReg builds upon the foundation established by the Learn2Reg Challenge by incorporating the registration of interventional cone-beam computed tomography (CBCT) with standard planning fan-beam CT (FBCT) images in radiotherapy. Accurate image registration is crucial in oncology, particularly for dynamic treatment adjustments in image-guided radiotherapy, where precise alignment is necessary to minimise radiation exposure to healthy tissues while effectively targeting tumours. This work details the methodology and data behind the OncoReg Challenge and provides a comprehensive analysis of the competition entries and results. Findings reveal that feature extraction plays a pivotal role in this registration task. A new method emerging from this challenge demonstrated its versatility, while established approaches continue to perform comparably to newer techniques. Both deep learning and classical approaches still play significant roles in image registration, with the combination of methods - particularly in feature extraction - proving most effective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23179v1-abstract-full').style.display = 'none'; document.getElementById('2503.23179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22984">arXiv:2503.22984</a> <span> [<a href="https://arxiv.org/pdf/2503.22984">pdf</a>, <a href="https://arxiv.org/format/2503.22984">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Optimal Transport-Guided Source-Free Adaptation for Face Anti-Spoofing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuowei Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tianchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhihua Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanbai Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qin Zhang</a>, <a href="/search/cs?searchtype=author&query=Bergamo%2C+A">Alessandro Bergamo</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+A+K">Anil K. Jain</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Y">Yifan Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22984v1-abstract-short" style="display: inline;"> Developing a face anti-spoofing model that meets the security requirements of clients worldwide is challenging due to the domain gap between training datasets and diverse end-user test data. Moreover, for security and privacy reasons, it is undesirable for clients to share a large amount of their face data with service providers. In this work, we introduce a novel method in which the face anti-spo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22984v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22984v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22984v1-abstract-full" style="display: none;"> Developing a face anti-spoofing model that meets the security requirements of clients worldwide is challenging due to the domain gap between training datasets and diverse end-user test data. Moreover, for security and privacy reasons, it is undesirable for clients to share a large amount of their face data with service providers. In this work, we introduce a novel method in which the face anti-spoofing model can be adapted by the client itself to a target domain at test time using only a small sample of data while keeping model parameters and training data inaccessible to the client. Specifically, we develop a prototype-based base model and an optimal transport-guided adaptor that enables adaptation in either a lightweight training or training-free fashion, without updating base model's parameters. Furthermore, we propose geodesic mixup, an optimal transport-based synthesis method that generates augmented training data along the geodesic path between source prototypes and target data distribution. This allows training a lightweight classifier to effectively adapt to target-specific characteristics while retaining essential knowledge learned from the source domain. In cross-domain and cross-attack settings, compared with recent methods, our method achieves average relative improvements of 19.17% in HTER and 8.58% in AUC, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22984v1-abstract-full').style.display = 'none'; document.getElementById('2503.22984v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.4; I.2.10; I.4.8; I.2.6; C.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21679">arXiv:2503.21679</a> <span> [<a href="https://arxiv.org/pdf/2503.21679">pdf</a>, <a href="https://arxiv.org/format/2503.21679">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> JiraiBench: A Bilingual Benchmark for Evaluating Large Language Models' Detection of Human Self-Destructive Behavior Content in Jirai Community </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yunze Xiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tingyu He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L+Z">Lionel Z. Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yiming Ma</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xingyu Song</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaohang Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+I">Irene Li</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+K+C">Ka Chung Ng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21679v2-abstract-short" style="display: inline;"> This paper introduces JiraiBench, the first bilingual benchmark for evaluating large language models' effectiveness in detecting self-destructive content across Chinese and Japanese social media communities. Focusing on the transnational "Jirai" (landmine) online subculture that encompasses multiple forms of self-destructive behaviors including drug overdose, eating disorders, and self-harm, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21679v2-abstract-full').style.display = 'inline'; document.getElementById('2503.21679v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21679v2-abstract-full" style="display: none;"> This paper introduces JiraiBench, the first bilingual benchmark for evaluating large language models' effectiveness in detecting self-destructive content across Chinese and Japanese social media communities. Focusing on the transnational "Jirai" (landmine) online subculture that encompasses multiple forms of self-destructive behaviors including drug overdose, eating disorders, and self-harm, we present a comprehensive evaluation framework incorporating both linguistic and cultural dimensions. Our dataset comprises 10,419 Chinese posts and 5,000 Japanese posts with multidimensional annotation along three behavioral categories, achieving substantial inter-annotator agreement. Experimental evaluations across four state-of-the-art models reveal significant performance variations based on instructional language, with Japanese prompts unexpectedly outperforming Chinese prompts when processing Chinese content. This emergent cross-cultural transfer suggests that cultural proximity can sometimes outweigh linguistic similarity in detection tasks. Cross-lingual transfer experiments with fine-tuned models further demonstrate the potential for knowledge transfer between these language systems without explicit target language training. These findings highlight the need for culturally-informed approaches to multilingual content moderation and provide empirical evidence for the importance of cultural context in developing more effective detection systems for vulnerable online communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21679v2-abstract-full').style.display = 'none'; document.getElementById('2503.21679v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 1 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21156">arXiv:2503.21156</a> <span> [<a href="https://arxiv.org/pdf/2503.21156">pdf</a>, <a href="https://arxiv.org/format/2503.21156">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> A Theoretical Analysis of Analogy-Based Evolutionary Transfer Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiaoming Xue</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Liang Feng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yinglan Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Rui Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21156v1-abstract-short" style="display: inline;"> Evolutionary transfer optimization (ETO) has been gaining popularity in research over the years due to its outstanding knowledge transfer ability to address various challenges in optimization. However, a pressing issue in this field is that the invention of new ETO algorithms has far outpaced the development of fundamental theories needed to clearly understand the key factors contributing to the s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21156v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21156v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21156v1-abstract-full" style="display: none;"> Evolutionary transfer optimization (ETO) has been gaining popularity in research over the years due to its outstanding knowledge transfer ability to address various challenges in optimization. However, a pressing issue in this field is that the invention of new ETO algorithms has far outpaced the development of fundamental theories needed to clearly understand the key factors contributing to the success of these algorithms for effective generalization. In response to this challenge, this study aims to establish theoretical foundations for analogy-based ETO, specifically to support various algorithms that frequently reference a key concept known as similarity. First, we introduce analogical reasoning and link its subprocesses to three key issues in ETO. Then, we develop theories for analogy-based knowledge transfer, rooted in the principles that underlie the subprocesses. Afterwards, we present two theorems related to the performance gain of analogy-based knowledge transfer, namely unconditionally nonnegative performance gain and conditionally positive performance gain, to theoretically demonstrate the effectiveness of various analogy-based ETO methods. Last but not least, we offer a novel insight into analogy-based ETO that interprets its conditional superiority over traditional evolutionary optimization through the lens of the no free lunch theorem for optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21156v1-abstract-full').style.display = 'none'; document.getElementById('2503.21156v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20981">arXiv:2503.20981</a> <span> [<a href="https://arxiv.org/pdf/2503.20981">pdf</a>, <a href="https://arxiv.org/format/2503.20981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Patients Speak, AI Listens: LLM-based Analysis of Online Reviews Uncovers Key Drivers for Urgent Care Satisfaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaoran Xu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Z">Zhaoqian Xue</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&query=Medri%2C+J">Jhonatan Medri</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Junjie Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiayan Zhou</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jin Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Siyuan Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lingyao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20981v1-abstract-short" style="display: inline;"> Investigating the public experience of urgent care facilities is essential for promoting community healthcare development. Traditional survey methods often fall short due to limited scope, time, and spatial coverage. Crowdsourcing through online reviews or social media offers a valuable approach to gaining such insights. With recent advancements in large language models (LLMs), extracting nuanced… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20981v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20981v1-abstract-full" style="display: none;"> Investigating the public experience of urgent care facilities is essential for promoting community healthcare development. Traditional survey methods often fall short due to limited scope, time, and spatial coverage. Crowdsourcing through online reviews or social media offers a valuable approach to gaining such insights. With recent advancements in large language models (LLMs), extracting nuanced perceptions from reviews has become feasible. This study collects Google Maps reviews across the DMV and Florida areas and conducts prompt engineering with the GPT model to analyze the aspect-based sentiment of urgent care. We first analyze the geospatial patterns of various aspects, including interpersonal factors, operational efficiency, technical quality, finances, and facilities. Next, we determine Census Block Group(CBG)-level characteristics underpinning differences in public perception, including population density, median income, GINI Index, rent-to-income ratio, household below poverty rate, no insurance rate, and unemployment rate. Our results show that interpersonal factors and operational efficiency emerge as the strongest determinants of patient satisfaction in urgent care, while technical quality, finances, and facilities show no significant independent effects when adjusted for in multivariate models. Among socioeconomic and demographic factors, only population density demonstrates a significant but modest association with patient ratings, while the remaining factors exhibit no significant correlations. Overall, this study highlights the potential of crowdsourcing to uncover the key factors that matter to residents and provide valuable insights for stakeholders to improve public satisfaction with urgent care. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20981v1-abstract-full').style.display = 'none'; document.getElementById('2503.20981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20462">arXiv:2503.20462</a> <span> [<a href="https://arxiv.org/pdf/2503.20462">pdf</a>, <a href="https://arxiv.org/format/2503.20462">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Multi-agent Uncertainty-Aware Pessimistic Model-Based Reinforcement Learning for Connected Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+R">Ruoqi Wen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rongpeng Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xing Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhifeng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20462v1-abstract-short" style="display: inline;"> Deep Reinforcement Learning (DRL) holds significant promise for achieving human-like Autonomous Vehicle (AV) capabilities, but suffers from low sample efficiency and challenges in reward design. Model-Based Reinforcement Learning (MBRL) offers improved sample efficiency and generalizability compared to Model-Free Reinforcement Learning (MFRL) in various multi-agent decision-making scenarios. Never… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20462v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20462v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20462v1-abstract-full" style="display: none;"> Deep Reinforcement Learning (DRL) holds significant promise for achieving human-like Autonomous Vehicle (AV) capabilities, but suffers from low sample efficiency and challenges in reward design. Model-Based Reinforcement Learning (MBRL) offers improved sample efficiency and generalizability compared to Model-Free Reinforcement Learning (MFRL) in various multi-agent decision-making scenarios. Nevertheless, MBRL faces critical difficulties in estimating uncertainty during the model learning phase, thereby limiting its scalability and applicability in real-world scenarios. Additionally, most Connected Autonomous Vehicle (CAV) studies focus on single-agent decision-making, while existing multi-agent MBRL solutions lack computationally tractable algorithms with Probably Approximately Correct (PAC) guarantees, an essential factor for ensuring policy reliability with limited training data. To address these challenges, we propose MA-PMBRL, a novel Multi-Agent Pessimistic Model-Based Reinforcement Learning framework for CAVs, incorporating a max-min optimization approach to enhance robustness and decision-making. To mitigate the inherent subjectivity of uncertainty estimation in MBRL and avoid incurring catastrophic failures in AV, MA-PMBRL employs a pessimistic optimization framework combined with Projected Gradient Descent (PGD) for both model and policy learning. MA-PMBRL also employs general function approximations under partial dataset coverage to enhance learning efficiency and system-level performance. By bounding the suboptimality of the resulting policy under mild theoretical assumptions, we successfully establish PAC guarantees for MA-PMBRL, demonstrating that the proposed framework represents a significant step toward scalable, efficient, and reliable multi-agent decision-making for CAVs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20462v1-abstract-full').style.display = 'none'; document.getElementById('2503.20462v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20314">arXiv:2503.20314</a> <span> [<a href="https://arxiv.org/pdf/2503.20314">pdf</a>, <a href="https://arxiv.org/format/2503.20314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Wan: Open and Advanced Large-Scale Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=WanTeam"> WanTeam</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ang Wang</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+B">Baole Ai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bin Wen</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chen-Wei Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Di Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feiwu Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haiming Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianxiao Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jianyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinkai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kang Zhao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Keyu Yan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianghua Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+M">Mengyang Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pandeng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pingyu Wu</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20314v1-abstract-short" style="display: inline;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20314v1-abstract-full" style="display: none;"> This report presents Wan, a comprehensive and open suite of video foundation models designed to push the boundaries of video generation. Built upon the mainstream diffusion transformer paradigm, Wan achieves significant advancements in generative capabilities through a series of innovations, including our novel VAE, scalable pre-training strategies, large-scale data curation, and automated evaluation metrics. These contributions collectively enhance the model's performance and versatility. Specifically, Wan is characterized by four key features: Leading Performance: The 14B model of Wan, trained on a vast dataset comprising billions of images and videos, demonstrates the scaling laws of video generation with respect to both data and model size. It consistently outperforms the existing open-source models as well as state-of-the-art commercial solutions across multiple internal and external benchmarks, demonstrating a clear and significant performance superiority. Comprehensiveness: Wan offers two capable models, i.e., 1.3B and 14B parameters, for efficiency and effectiveness respectively. It also covers multiple downstream applications, including image-to-video, instruction-guided video editing, and personal video generation, encompassing up to eight tasks. Consumer-Grade Efficiency: The 1.3B model demonstrates exceptional resource efficiency, requiring only 8.19 GB VRAM, making it compatible with a wide range of consumer-grade GPUs. Openness: We open-source the entire series of Wan, including source code and all models, with the goal of fostering the growth of the video generation community. This openness seeks to significantly expand the creative possibilities of video production in the industry and provide academia with high-quality video foundation models. All the code and models are available at https://github.com/Wan-Video/Wan2.1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20314v1-abstract-full').style.display = 'none'; document.getElementById('2503.20314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 pages, 33 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19916">arXiv:2503.19916</a> <span> [<a href="https://arxiv.org/pdf/2503.19916">pdf</a>, <a href="https://arxiv.org/format/2503.19916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> EventFly: Event Camera Perception from Ground to the Sky </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingdong Kong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dongyue Lu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiang Xu</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+L+X">Lai Xing Ng</a>, <a href="/search/cs?searchtype=author&query=Ooi%2C+W+T">Wei Tsang Ooi</a>, <a href="/search/cs?searchtype=author&query=Cottereau%2C+B+R">Benoit R. Cottereau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19916v1-abstract-short" style="display: inline;"> Cross-platform adaptation in event-based dense perception is crucial for deploying event cameras across diverse settings, such as vehicles, drones, and quadrupeds, each with unique motion dynamics, viewpoints, and class distributions. In this work, we introduce EventFly, a framework for robust cross-platform adaptation in event camera perception. Our approach comprises three key components: i) Eve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19916v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19916v1-abstract-full" style="display: none;"> Cross-platform adaptation in event-based dense perception is crucial for deploying event cameras across diverse settings, such as vehicles, drones, and quadrupeds, each with unique motion dynamics, viewpoints, and class distributions. In this work, we introduce EventFly, a framework for robust cross-platform adaptation in event camera perception. Our approach comprises three key components: i) Event Activation Prior (EAP), which identifies high-activation regions in the target domain to minimize prediction entropy, fostering confident, domain-adaptive predictions; ii) EventBlend, a data-mixing strategy that integrates source and target event voxel grids based on EAP-driven similarity and density maps, enhancing feature alignment; and iii) EventMatch, a dual-discriminator technique that aligns features from source, target, and blended domains for better domain-invariant learning. To holistically assess cross-platform adaptation abilities, we introduce EXPo, a large-scale benchmark with diverse samples across vehicle, drone, and quadruped platforms. Extensive experiments validate our effectiveness, demonstrating substantial gains over popular adaptation methods. We hope this work can pave the way for more adaptive, high-performing event perception across diverse and complex environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19916v1-abstract-full').style.display = 'none'; document.getElementById('2503.19916v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025; 30 pages, 8 figures, 16 tables; Project Page at https://event-fly.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19912">arXiv:2503.19912</a> <span> [<a href="https://arxiv.org/pdf/2503.19912">pdf</a>, <a href="https://arxiv.org/format/2503.19912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SuperFlow++: Enhanced Spatiotemporal Consistency for Cross-Modal Data Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiang Xu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingdong Kong</a>, <a href="/search/cs?searchtype=author&query=Shuai%2C+H">Hui Shuai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+L">Liang Pan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingshan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19912v1-abstract-short" style="display: inline;"> LiDAR representation learning has emerged as a promising approach to reducing reliance on costly and labor-intensive human annotations. While existing methods primarily focus on spatial alignment between LiDAR and camera sensors, they often overlook the temporal dynamics critical for capturing motion and scene continuity in driving scenarios. To address this limitation, we propose SuperFlow++, a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19912v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19912v1-abstract-full" style="display: none;"> LiDAR representation learning has emerged as a promising approach to reducing reliance on costly and labor-intensive human annotations. While existing methods primarily focus on spatial alignment between LiDAR and camera sensors, they often overlook the temporal dynamics critical for capturing motion and scene continuity in driving scenarios. To address this limitation, we propose SuperFlow++, a novel framework that integrates spatiotemporal cues in both pretraining and downstream tasks using consecutive LiDAR-camera pairs. SuperFlow++ introduces four key components: (1) a view consistency alignment module to unify semantic information across camera views, (2) a dense-to-sparse consistency regularization mechanism to enhance feature robustness across varying point cloud densities, (3) a flow-based contrastive learning approach that models temporal relationships for improved scene understanding, and (4) a temporal voting strategy that propagates semantic information across LiDAR scans to improve prediction consistency. Extensive evaluations on 11 heterogeneous LiDAR datasets demonstrate that SuperFlow++ outperforms state-of-the-art methods across diverse tasks and driving conditions. Furthermore, by scaling both 2D and 3D backbones during pretraining, we uncover emergent properties that provide deeper insights into developing scalable 3D foundation models. With strong generalizability and computational efficiency, SuperFlow++ establishes a new benchmark for data-efficient LiDAR-based perception in autonomous driving. The code is publicly available at https://github.com/Xiangxu-0103/SuperFlow <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19912v1-abstract-full').style.display = 'none'; document.getElementById('2503.19912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint; 15 pages, 6 figures, 10 tables; Code at https://github.com/Xiangxu-0103/SuperFlow</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19791">arXiv:2503.19791</a> <span> [<a href="https://arxiv.org/pdf/2503.19791">pdf</a>, <a href="https://arxiv.org/format/2503.19791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SITA: Structurally Imperceptible and Transferable Adversarial Attacks for Stylized Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jingdan Kang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haoxin Yang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yan Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huaidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuemiao Xu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yong Du</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shengfeng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19791v1-abstract-short" style="display: inline;"> Image generation technology has brought significant advancements across various fields but has also raised concerns about data misuse and potential rights infringements, particularly with respect to creating visual artworks. Current methods aimed at safeguarding artworks often employ adversarial attacks. However, these methods face challenges such as poor transferability, high computational costs,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19791v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19791v1-abstract-full" style="display: none;"> Image generation technology has brought significant advancements across various fields but has also raised concerns about data misuse and potential rights infringements, particularly with respect to creating visual artworks. Current methods aimed at safeguarding artworks often employ adversarial attacks. However, these methods face challenges such as poor transferability, high computational costs, and the introduction of noticeable noise, which compromises the aesthetic quality of the original artwork. To address these limitations, we propose a Structurally Imperceptible and Transferable Adversarial (SITA) attacks. SITA leverages a CLIP-based destylization loss, which decouples and disrupts the robust style representation of the image. This disruption hinders style extraction during stylized image generation, thereby impairing the overall stylization process. Importantly, SITA eliminates the need for a surrogate diffusion model, leading to significantly reduced computational overhead. The method's robust style feature disruption ensures high transferability across diverse models. Moreover, SITA introduces perturbations by embedding noise within the imperceptible structural details of the image. This approach effectively protects against style extraction without compromising the visual quality of the artwork. Extensive experiments demonstrate that SITA offers superior protection for artworks against unauthorized use in stylized generation. It significantly outperforms existing methods in terms of transferability, computational efficiency, and noise imperceptibility. Code is available at https://github.com/A-raniy-day/SITA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19791v1-abstract-full').style.display = 'none'; document.getElementById('2503.19791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19416">arXiv:2503.19416</a> <span> [<a href="https://arxiv.org/pdf/2503.19416">pdf</a>, <a href="https://arxiv.org/format/2503.19416">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EmoHead: Emotional Talking Head via Manipulating Semantic Expression Parameters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xuli Shen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Hua Cai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dingding Yu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Weilin Shen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qing Xu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangyang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19416v1-abstract-short" style="display: inline;"> Generating emotion-specific talking head videos from audio input is an important and complex challenge for human-machine interaction. However, emotion is highly abstract concept with ambiguous boundaries, and it necessitates disentangled expression parameters to generate emotionally expressive talking head videos. In this work, we present EmoHead to synthesize talking head videos via semantic expr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19416v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19416v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19416v1-abstract-full" style="display: none;"> Generating emotion-specific talking head videos from audio input is an important and complex challenge for human-machine interaction. However, emotion is highly abstract concept with ambiguous boundaries, and it necessitates disentangled expression parameters to generate emotionally expressive talking head videos. In this work, we present EmoHead to synthesize talking head videos via semantic expression parameters. To predict expression parameter for arbitrary audio input, we apply an audio-expression module that can be specified by an emotion tag. This module aims to enhance correlation from audio input across various emotions. Furthermore, we leverage pre-trained hyperplane to refine facial movements by probing along the vertical direction. Finally, the refined expression parameters regularize neural radiance fields and facilitate the emotion-consistent generation of talking head videos. Experimental results demonstrate that semantic expression parameters lead to better reconstruction quality and controllability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19416v1-abstract-full').style.display = 'none'; document.getElementById('2503.19416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19328">arXiv:2503.19328</a> <span> [<a href="https://arxiv.org/pdf/2503.19328">pdf</a>, <a href="https://arxiv.org/format/2503.19328">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Substance over Style: Evaluating Proactive Conversational Coaching Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Srinivas%2C+V">Vidya Srinivas</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuhai Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Ayush%2C+K">Kumar Ayush</a>, <a href="/search/cs?searchtype=author&query=Galatzer-Levy%2C+I">Isaac Galatzer-Levy</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+S">Shwetak Patel</a>, <a href="/search/cs?searchtype=author&query=McDuff%2C+D">Daniel McDuff</a>, <a href="/search/cs?searchtype=author&query=Althoff%2C+T">Tim Althoff</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19328v1-abstract-short" style="display: inline;"> While NLP research has made strides in conversational tasks, many approaches focus on single-turn responses with well-defined objectives or evaluation criteria. In contrast, coaching presents unique challenges with initially undefined goals that evolve through multi-turn interactions, subjective evaluation criteria, mixed-initiative dialogue. In this work, we describe and implement five multi-turn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19328v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19328v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19328v1-abstract-full" style="display: none;"> While NLP research has made strides in conversational tasks, many approaches focus on single-turn responses with well-defined objectives or evaluation criteria. In contrast, coaching presents unique challenges with initially undefined goals that evolve through multi-turn interactions, subjective evaluation criteria, mixed-initiative dialogue. In this work, we describe and implement five multi-turn coaching agents that exhibit distinct conversational styles, and evaluate them through a user study, collecting first-person feedback on 155 conversations. We find that users highly value core functionality, and that stylistic components in absence of core components are viewed negatively. By comparing user feedback with third-person evaluations from health experts and an LM, we reveal significant misalignment across evaluation approaches. Our findings provide insights into design and evaluation of conversational coaching agents and contribute toward improving human-centered NLP applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19328v1-abstract-full').style.display = 'none'; document.getElementById('2503.19328v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18760">arXiv:2503.18760</a> <span> [<a href="https://arxiv.org/pdf/2503.18760">pdf</a>, <a href="https://arxiv.org/format/2503.18760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Synthetic Function Demonstrations Improve Generation in Low-Resource Programming Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=McKenna%2C+N">Nick McKenna</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinnuo Xu</a>, <a href="/search/cs?searchtype=author&query=Williams%2C+J">Jack Williams</a>, <a href="/search/cs?searchtype=author&query=Wilson%2C+N">Nick Wilson</a>, <a href="/search/cs?searchtype=author&query=Van+Durme%2C+B">Benjamin Van Durme</a>, <a href="/search/cs?searchtype=author&query=Poelitz%2C+C">Christian Poelitz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18760v1-abstract-short" style="display: inline;"> A key consideration when training an LLM is whether the target language is more or less resourced, whether this is English compared to Welsh, or Python compared to Excel. Typical training data for programming languages consist of real program demonstrations coupled with human-written comments. Here we present novel approaches to the creation of such data for low resource programming languages. We… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18760v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18760v1-abstract-full" style="display: none;"> A key consideration when training an LLM is whether the target language is more or less resourced, whether this is English compared to Welsh, or Python compared to Excel. Typical training data for programming languages consist of real program demonstrations coupled with human-written comments. Here we present novel approaches to the creation of such data for low resource programming languages. We generate fully-synthetic, textbook-quality demonstrations of common library functions in an example domain of Excel formulas, using a teacher model. We then finetune an underperforming student model, and show improvement on 2 question-answering datasets recast into the Excel domain. We show advantages of finetuning over standard, off-the-shelf RAG approaches, which can offer only modest improvement due to the unfamiliar target domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18760v1-abstract-full').style.display = 'none'; document.getElementById('2503.18760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18454">arXiv:2503.18454</a> <span> [<a href="https://arxiv.org/pdf/2503.18454">pdf</a>, <a href="https://arxiv.org/format/2503.18454">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InPO: Inversion Preference Optimization with Reparametrized DDIM for Efficient Diffusion Model Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yunhong Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qichao Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+H">Hengyuan Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xierui Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaoyin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18454v1-abstract-short" style="display: inline;"> Without using explicit reward, direct preference optimization (DPO) employs paired human preference data to fine-tune generative models, a method that has garnered considerable attention in large language models (LLMs). However, exploration of aligning text-to-image (T2I) diffusion models with human preferences remains limited. In comparison to supervised fine-tuning, existing methods that align d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18454v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18454v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18454v1-abstract-full" style="display: none;"> Without using explicit reward, direct preference optimization (DPO) employs paired human preference data to fine-tune generative models, a method that has garnered considerable attention in large language models (LLMs). However, exploration of aligning text-to-image (T2I) diffusion models with human preferences remains limited. In comparison to supervised fine-tuning, existing methods that align diffusion model suffer from low training efficiency and subpar generation quality due to the long Markov chain process and the intractability of the reverse process. To address these limitations, we introduce DDIM-InPO, an efficient method for direct preference alignment of diffusion models. Our approach conceptualizes diffusion model as a single-step generative model, allowing us to fine-tune the outputs of specific latent variables selectively. In order to accomplish this objective, we first assign implicit rewards to any latent variable directly via a reparameterization technique. Then we construct an Inversion technique to estimate appropriate latent variables for preference optimization. This modification process enables the diffusion model to only fine-tune the outputs of latent variables that have a strong correlation with the preference dataset. Experimental results indicate that our DDIM-InPO achieves state-of-the-art performance with just 400 steps of fine-tuning, surpassing all preference aligning baselines for T2I diffusion models in human preference evaluation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18454v1-abstract-full').style.display = 'none'; document.getElementById('2503.18454v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18393">arXiv:2503.18393</a> <span> [<a href="https://arxiv.org/pdf/2503.18393">pdf</a>, <a href="https://arxiv.org/format/2503.18393">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PDDM: Pseudo Depth Diffusion Model for RGB-PD Semantic Segmentation Based in Complex Indoor Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinhua Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hong Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianbing Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinfu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18393v1-abstract-short" style="display: inline;"> The integration of RGB and depth modalities significantly enhances the accuracy of segmenting complex indoor scenes, with depth data from RGB-D cameras playing a crucial role in this improvement. However, collecting an RGB-D dataset is more expensive than an RGB dataset due to the need for specialized depth sensors. Aligning depth and RGB images also poses challenges due to sensor positioning and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18393v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18393v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18393v1-abstract-full" style="display: none;"> The integration of RGB and depth modalities significantly enhances the accuracy of segmenting complex indoor scenes, with depth data from RGB-D cameras playing a crucial role in this improvement. However, collecting an RGB-D dataset is more expensive than an RGB dataset due to the need for specialized depth sensors. Aligning depth and RGB images also poses challenges due to sensor positioning and issues like missing data and noise. In contrast, Pseudo Depth (PD) from high-precision depth estimation algorithms can eliminate the dependence on RGB-D sensors and alignment processes, as well as provide effective depth information and show significant potential in semantic segmentation. Therefore, to explore the practicality of utilizing pseudo depth instead of real depth for semantic segmentation, we design an RGB-PD segmentation pipeline to integrate RGB and pseudo depth and propose a Pseudo Depth Aggregation Module (PDAM) for fully exploiting the informative clues provided by the diverse pseudo depth maps. The PDAM aggregates multiple pseudo depth maps into a single modality, making it easily adaptable to other RGB-D segmentation methods. In addition, the pre-trained diffusion model serves as a strong feature extractor for RGB segmentation tasks, but multi-modal diffusion-based segmentation methods remain unexplored. Therefore, we present a Pseudo Depth Diffusion Model (PDDM) that adopts a large-scale text-image diffusion model as a feature extractor and a simple yet effective fusion strategy to integrate pseudo depth. To verify the applicability of pseudo depth and our PDDM, we perform extensive experiments on the NYUv2 and SUNRGB-D datasets. The experimental results demonstrate that pseudo depth can effectively enhance segmentation performance, and our PDDM achieves state-of-the-art performance, outperforming other methods by +6.98 mIoU on NYUv2 and +2.11 mIoU on SUNRGB-D. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18393v1-abstract-full').style.display = 'none'; document.getElementById('2503.18393v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18147">arXiv:2503.18147</a> <span> [<a href="https://arxiv.org/pdf/2503.18147">pdf</a>, <a href="https://arxiv.org/format/2503.18147">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PHT-CAD: Efficient CAD Parametric Primitive Analysis with Progressive Hierarchical Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+K">Ke Niu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuwen Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haiyang Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuofan Chen</a>, <a href="/search/cs?searchtype=author&query=Que%2C+X">Xianghui Que</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangyang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18147v2-abstract-short" style="display: inline;"> Computer-Aided Design (CAD) plays a pivotal role in industrial manufacturing, yet 2D Parametric Primitive Analysis (PPA) remains underexplored due to two key challenges: structural constraint reasoning and advanced semantic understanding. To tackle these challenges, we first propose an Efficient Hybrid Parametrization (EHP) for better representing 2D engineering drawings. EHP contains four types o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18147v2-abstract-full').style.display = 'inline'; document.getElementById('2503.18147v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18147v2-abstract-full" style="display: none;"> Computer-Aided Design (CAD) plays a pivotal role in industrial manufacturing, yet 2D Parametric Primitive Analysis (PPA) remains underexplored due to two key challenges: structural constraint reasoning and advanced semantic understanding. To tackle these challenges, we first propose an Efficient Hybrid Parametrization (EHP) for better representing 2D engineering drawings. EHP contains four types of atomic component i.e., point, line, circle, and arc). Additionally, we propose PHT-CAD, a novel 2D PPA framework that harnesses the modality alignment and reasoning capabilities of Vision-Language Models (VLMs) for precise engineering drawing analysis. In PHT-CAD, we introduce four dedicated regression heads to predict corresponding atomic components. To train PHT-CAD, a three-stage training paradigm Progressive Hierarchical Tuning (PHT) is proposed to progressively enhance PHT-CAD's capability to perceive individual primitives, infer structural constraints, and align annotation layers with their corresponding geometric representations. Considering that existing datasets lack complete annotation layers and real-world engineering drawings, we introduce ParaCAD, the first large-scale benchmark that explicitly integrates both the geometric and annotation layers. ParaCAD comprises over 10 million annotated drawings for training and 3,000 real-world industrial drawings with complex topological structures and physical constraints for test. Extensive experiments demonstrate the effectiveness of PHT-CAD and highlight the practical significance of ParaCAD in advancing 2D PPA research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18147v2-abstract-full').style.display = 'none'; document.getElementById('2503.18147v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17994">arXiv:2503.17994</a> <span> [<a href="https://arxiv.org/pdf/2503.17994">pdf</a>, <a href="https://arxiv.org/format/2503.17994">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Instructing the Architecture Search for Spatial-temporal Sequence Forecasting with LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xin Xue</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Haoyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yizhou Long</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianxin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17994v1-abstract-short" style="display: inline;"> Spatial-temporal sequence forecasting (STSF) is a long-standing research problem with widespread real-world applications. Neural architecture search (NAS), which automates the neural network design, has been shown effective in tackling the STSF problem. However, the existing NAS methods for STSF focus on generating architectures in a time-consuming data-driven fashion, which heavily limits their a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17994v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17994v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17994v1-abstract-full" style="display: none;"> Spatial-temporal sequence forecasting (STSF) is a long-standing research problem with widespread real-world applications. Neural architecture search (NAS), which automates the neural network design, has been shown effective in tackling the STSF problem. However, the existing NAS methods for STSF focus on generating architectures in a time-consuming data-driven fashion, which heavily limits their ability to use background knowledge and explore the complicated search trajectory. Large language models (LLMs) have shown remarkable ability in decision-making with comprehensive internal world knowledge, but how it could benefit NAS for STSF remains unexplored. In this paper, we propose a novel NAS method for STSF based on LLM. Instead of directly generate architectures with LLM, We inspire the LLM's capability with a multi-level enhancement mechanism. Specifically, on the step-level, we decompose the generation task into decision steps with powerful prompt engineering and inspire LLM to serve as instructor for architecture search based on its internal knowledge. On the instance-level, we utilize a one-step tuning framework to quickly evaluate the architecture instance and a memory bank to cumulate knowledge to improve LLM's search ability. On the task-level, we propose a two-stage architecture search, balancing the exploration stage and optimization stage, to reduce the possibility of being trapped in local optima. Extensive experimental results demonstrate that our method can achieve competitive effectiveness with superior efficiency against existing NAS methods for STSF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17994v1-abstract-full').style.display = 'none'; document.getElementById('2503.17994v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17712">arXiv:2503.17712</a> <span> [<a href="https://arxiv.org/pdf/2503.17712">pdf</a>, <a href="https://arxiv.org/format/2503.17712">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-modality Anomaly Segmentation on the Road </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+H">Heng Gao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhuolin He</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+S">Shoumeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiangyang Xue</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+J">Jian Pu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17712v1-abstract-short" style="display: inline;"> Semantic segmentation allows autonomous driving cars to understand the surroundings of the vehicle comprehensively. However, it is also crucial for the model to detect obstacles that may jeopardize the safety of autonomous driving systems. Based on our experiments, we find that current uni-modal anomaly segmentation frameworks tend to produce high anomaly scores for non-anomalous regions in images… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17712v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17712v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17712v1-abstract-full" style="display: none;"> Semantic segmentation allows autonomous driving cars to understand the surroundings of the vehicle comprehensively. However, it is also crucial for the model to detect obstacles that may jeopardize the safety of autonomous driving systems. Based on our experiments, we find that current uni-modal anomaly segmentation frameworks tend to produce high anomaly scores for non-anomalous regions in images. Motivated by this empirical finding, we develop a multi-modal uncertainty-based anomaly segmentation framework, named MMRAS+, for autonomous driving systems. MMRAS+ effectively reduces the high anomaly outputs of non-anomalous classes by introducing text-modal using the CLIP text encoder. Indeed, MMRAS+ is the first multi-modal anomaly segmentation solution for autonomous driving. Moreover, we develop an ensemble module to further boost the anomaly segmentation performance. Experiments on RoadAnomaly, SMIYC, and Fishyscapes validation datasets demonstrate the superior performance of our method. The code is available in https://github.com/HengGao12/MMRAS_plus. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17712v1-abstract-full').style.display = 'none'; document.getElementById('2503.17712v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16266">arXiv:2503.16266</a> <span> [<a href="https://arxiv.org/pdf/2503.16266">pdf</a>, <a href="https://arxiv.org/format/2503.16266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> From Head to Tail: Efficient Black-box Model Inversion Attack via Long-tailed Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongguang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Juan Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Meihui Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hongxin Hu</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+W">Wenzhe Yi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaoyang Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mengda Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenjun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16266v1-abstract-short" style="display: inline;"> Model Inversion Attacks (MIAs) aim to reconstruct private training data from models, leading to privacy leakage, particularly in facial recognition systems. Although many studies have enhanced the effectiveness of white-box MIAs, less attention has been paid to improving efficiency and utility under limited attacker capabilities. Existing black-box MIAs necessitate an impractical number of queries… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16266v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16266v1-abstract-full" style="display: none;"> Model Inversion Attacks (MIAs) aim to reconstruct private training data from models, leading to privacy leakage, particularly in facial recognition systems. Although many studies have enhanced the effectiveness of white-box MIAs, less attention has been paid to improving efficiency and utility under limited attacker capabilities. Existing black-box MIAs necessitate an impractical number of queries, incurring significant overhead. Therefore, we analyze the limitations of existing MIAs and introduce Surrogate Model-based Inversion with Long-tailed Enhancement (SMILE), a high-resolution oriented and query-efficient MIA for the black-box setting. We begin by analyzing the initialization of MIAs from a data distribution perspective and propose a long-tailed surrogate training method to obtain high-quality initial points. We then enhance the attack's effectiveness by employing the gradient-free black-box optimization algorithm selected by NGOpt. Our experiments show that SMILE outperforms existing state-of-the-art black-box MIAs while requiring only about 5% of the query overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16266v1-abstract-full').style.display = 'none'; document.getElementById('2503.16266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15655">arXiv:2503.15655</a> <span> [<a href="https://arxiv.org/pdf/2503.15655">pdf</a>, <a href="https://arxiv.org/format/2503.15655">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> R$^2$: A LLM Based Novel-to-Screenplay Generation Framework with Causal Plot Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zefeng Lin</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yi Xiao</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+Z">Zhiqiang Mo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiayang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengyi Liu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+X">Xianyong Fang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaohua Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15655v1-abstract-short" style="display: inline;"> Automatically adapting novels into screenplays is important for the TV, film, or opera industries to promote products with low costs. The strong performances of large language models (LLMs) in long-text generation call us to propose a LLM based framework Reader-Rewriter (R$^2$) for this task. However, there are two fundamental challenges here. First, the LLM hallucinations may cause inconsistent p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15655v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15655v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15655v1-abstract-full" style="display: none;"> Automatically adapting novels into screenplays is important for the TV, film, or opera industries to promote products with low costs. The strong performances of large language models (LLMs) in long-text generation call us to propose a LLM based framework Reader-Rewriter (R$^2$) for this task. However, there are two fundamental challenges here. First, the LLM hallucinations may cause inconsistent plot extraction and screenplay generation. Second, the causality-embedded plot lines should be effectively extracted for coherent rewriting. Therefore, two corresponding tactics are proposed: 1) A hallucination-aware refinement method (HAR) to iteratively discover and eliminate the affections of hallucinations; and 2) a causal plot-graph construction method (CPC) based on a greedy cycle-breaking algorithm to efficiently construct plot lines with event causalities. Recruiting those efficient techniques, R$^2$ utilizes two modules to mimic the human screenplay rewriting process: The Reader module adopts a sliding window and CPC to build the causal plot graphs, while the Rewriter module generates first the scene outlines based on the graphs and then the screenplays. HAR is integrated into both modules for accurate inferences of LLMs. Experimental results demonstrate the superiority of R$^2$, which substantially outperforms three existing approaches (51.3%, 22.6%, and 57.1% absolute increases) in pairwise comparison at the overall win rate for GPT-4o. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15655v1-abstract-full').style.display = 'none'; document.getElementById('2503.15655v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15369">arXiv:2503.15369</a> <span> [<a href="https://arxiv.org/pdf/2503.15369">pdf</a>, <a href="https://arxiv.org/format/2503.15369">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EfficientLLaVA:Generalizable Auto-Pruning for Large Vision-language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yinan Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiuwei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15369v1-abstract-short" style="display: inline;"> While multimodal large language models demonstrate strong performance in complex reasoning tasks, they pose significant challenges related to model complexity during deployment, especially for resource-limited devices. In this paper, we propose an automatic pruning method for large vision-language models to enhance the efficiency of multimodal reasoning. Conventional methods rely on the training d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15369v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15369v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15369v1-abstract-full" style="display: none;"> While multimodal large language models demonstrate strong performance in complex reasoning tasks, they pose significant challenges related to model complexity during deployment, especially for resource-limited devices. In this paper, we propose an automatic pruning method for large vision-language models to enhance the efficiency of multimodal reasoning. Conventional methods rely on the training data of the original model to select the proper pruning ratio for different network components. However, these methods are impractical for large vision-language models due to the unaffordable search costs caused by web-scale training corpus. In contrast, our approach only leverages a small number of samples to search for the desired pruning policy by maximizing its generalization ability on unknown training data while maintaining the model accuracy, which enables the achievement of an optimal trade-off between accuracy and efficiency for large visual language models. Specifically, we formulate the generalization gap of the pruning strategy using the structural risk minimization principle. Based on both task performance and generalization capability, we iteratively search for the optimal pruning policy within a given search space and optimize the vision projector to evolve the search space with higher upper bound of performance. We conduct extensive experiments on the ScienceQA, Vizwiz, MM-vet, and LLaVA-Bench datasets for the task of visual question answering. Using only 64 samples for pruning policy search, EfficientLLaVA achieves an accuracy of 83.05% on ScienceQA, along with a $\times$ 1.8 speedup compared to the dense LLaVA-v1.5-7B model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15369v1-abstract-full').style.display = 'none'; document.getElementById('2503.15369v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14910">arXiv:2503.14910</a> <span> [<a href="https://arxiv.org/pdf/2503.14910">pdf</a>, <a href="https://arxiv.org/format/2503.14910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust Distribution Alignment for Industrial Anomaly Detection under Distribution Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jingyi Liao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xun Xu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yongyi Su</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+R">Rong-Cheng Tu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xulei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14910v1-abstract-short" style="display: inline;"> Anomaly detection plays a crucial role in quality control for industrial applications. However, ensuring robustness under unseen domain shifts such as lighting variations or sensor drift remains a significant challenge. Existing methods attempt to address domain shifts by training generalizable models but often rely on prior knowledge of target distributions and can hardly generalise to backbones… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14910v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14910v1-abstract-full" style="display: none;"> Anomaly detection plays a crucial role in quality control for industrial applications. However, ensuring robustness under unseen domain shifts such as lighting variations or sensor drift remains a significant challenge. Existing methods attempt to address domain shifts by training generalizable models but often rely on prior knowledge of target distributions and can hardly generalise to backbones designed for other data modalities. To overcome these limitations, we build upon memory-bank-based anomaly detection methods, optimizing a robust Sinkhorn distance on limited target training data to enhance generalization to unseen target domains. We evaluate the effectiveness on both 2D and 3D anomaly detection benchmarks with simulated distribution shifts. Our proposed method demonstrates superior results compared with state-of-the-art anomaly detection and domain adaptation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14910v1-abstract-full').style.display = 'none'; document.getElementById('2503.14910v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14908">arXiv:2503.14908</a> <span> [<a href="https://arxiv.org/pdf/2503.14908">pdf</a>, <a href="https://arxiv.org/format/2503.14908">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> POSTA: A Go-to Framework for Customized Artistic Poster Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaojie Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jingjing Ren</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+T">Tian Ye</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songhua Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lei Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinchao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14908v1-abstract-short" style="display: inline;"> Poster design is a critical medium for visual communication. Prior work has explored automatic poster design using deep learning techniques, but these approaches lack text accuracy, user customization, and aesthetic appeal, limiting their applicability in artistic domains such as movies and exhibitions, where both clear content delivery and visual impact are essential. To address these limitations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14908v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14908v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14908v1-abstract-full" style="display: none;"> Poster design is a critical medium for visual communication. Prior work has explored automatic poster design using deep learning techniques, but these approaches lack text accuracy, user customization, and aesthetic appeal, limiting their applicability in artistic domains such as movies and exhibitions, where both clear content delivery and visual impact are essential. To address these limitations, we present POSTA: a modular framework powered by diffusion models and multimodal large language models (MLLMs) for customized artistic poster generation. The framework consists of three modules. Background Diffusion creates a themed background based on user input. Design MLLM then generates layout and typography elements that align with and complement the background style. Finally, to enhance the poster's aesthetic appeal, ArtText Diffusion applies additional stylization to key text elements. The final result is a visually cohesive and appealing poster, with a fully modular process that allows for complete customization. To train our models, we develop the PosterArt dataset, comprising high-quality artistic posters annotated with layout, typography, and pixel-level stylized text segmentation. Our comprehensive experimental analysis demonstrates POSTA's exceptional controllability and design diversity, outperforming existing models in both text accuracy and aesthetic quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14908v1-abstract-full').style.display = 'none'; document.getElementById('2503.14908v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14097">arXiv:2503.14097</a> <span> [<a href="https://arxiv.org/pdf/2503.14097">pdf</a>, <a href="https://arxiv.org/format/2503.14097">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SCJD: Sparse Correlation and Joint Distillation for Efficient 3D Human Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weihong Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuemiao Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haoxin Yang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yi Xie</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Peng Xiao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Cheng Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huaidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Heng%2C+P">Pheng-Ann Heng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14097v1-abstract-short" style="display: inline;"> Existing 3D Human Pose Estimation (HPE) methods achieve high accuracy but suffer from computational overhead and slow inference, while knowledge distillation methods fail to address spatial relationships between joints and temporal correlations in multi-frame inputs. In this paper, we propose Sparse Correlation and Joint Distillation (SCJD), a novel framework that balances efficiency and accuracy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14097v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14097v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14097v1-abstract-full" style="display: none;"> Existing 3D Human Pose Estimation (HPE) methods achieve high accuracy but suffer from computational overhead and slow inference, while knowledge distillation methods fail to address spatial relationships between joints and temporal correlations in multi-frame inputs. In this paper, we propose Sparse Correlation and Joint Distillation (SCJD), a novel framework that balances efficiency and accuracy for 3D HPE. SCJD introduces Sparse Correlation Input Sequence Downsampling to reduce redundancy in student network inputs while preserving inter-frame correlations. For effective knowledge transfer, we propose Dynamic Joint Spatial Attention Distillation, which includes Dynamic Joint Embedding Distillation to enhance the student's feature representation using the teacher's multi-frame context feature, and Adjacent Joint Attention Distillation to improve the student network's focus on adjacent joint relationships for better spatial understanding. Additionally, Temporal Consistency Distillation aligns the temporal correlations between teacher and student networks through upsampling and global supervision. Extensive experiments demonstrate that SCJD achieves state-of-the-art performance. Code is available at https://github.com/wileychan/SCJD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14097v1-abstract-full').style.display = 'none'; document.getElementById('2503.14097v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13446">arXiv:2503.13446</a> <span> [<a href="https://arxiv.org/pdf/2503.13446">pdf</a>, <a href="https://arxiv.org/format/2503.13446">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoManipVLA: Transferring Vision-language-action Models for General Mobile Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenyu Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiuwei Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Haibin Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13446v1-abstract-short" style="display: inline;"> Mobile manipulation is the fundamental challenge for robotics to assist humans with diverse tasks and environments in everyday life. However, conventional mobile manipulation approaches often struggle to generalize across different tasks and environments because of the lack of large-scale training. In contrast, recent advances in vision-language-action (VLA) models have shown impressive generaliza… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13446v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13446v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13446v1-abstract-full" style="display: none;"> Mobile manipulation is the fundamental challenge for robotics to assist humans with diverse tasks and environments in everyday life. However, conventional mobile manipulation approaches often struggle to generalize across different tasks and environments because of the lack of large-scale training. In contrast, recent advances in vision-language-action (VLA) models have shown impressive generalization capabilities, but these foundation models are developed for fixed-base manipulation tasks. Therefore, we propose an efficient policy adaptation framework named MoManipVLA to transfer pre-trained VLA models of fix-base manipulation to mobile manipulation, so that high generalization ability across tasks and environments can be achieved in mobile manipulation policy. Specifically, we utilize pre-trained VLA models to generate waypoints of the end-effector with high generalization ability. We design motion planning objectives for the mobile base and the robot arm, which aim at maximizing the physical feasibility of the trajectory. Finally, we present an efficient bi-level objective optimization framework for trajectory generation, where the upper-level optimization predicts waypoints for base movement to enhance the manipulator policy space, and the lower-level optimization selects the optimal end-effector trajectory to complete the manipulation task. In this way, MoManipVLA can adjust the position of the robot base in a zero-shot manner, thus making the waypoints predicted from the fixed-base VLA models feasible. Extensive experimental results on OVMM and the real world demonstrate that MoManipVLA achieves a 4.2% higher success rate than the state-of-the-art mobile manipulation, and only requires 50 training cost for real world deployment due to the strong generalization ability in the pre-trained VLA models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13446v1-abstract-full').style.display = 'none'; document.getElementById('2503.13446v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025. Project Page: https://gary3410.github.io/momanipVLA/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13424">arXiv:2503.13424</a> <span> [<a href="https://arxiv.org/pdf/2503.13424">pdf</a>, <a href="https://arxiv.org/format/2503.13424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Infinite Mobility: Scalable High-Fidelity Synthesis of Articulated Objects via Procedural Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lian%2C+X">Xinyu Lian</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zichao Yu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruiming Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yitong Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L+R">Li Ray Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kaixu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuanzhen Zhou</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Q">Qihong Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xudong Xu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Z">Zhaoyang Lyu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+B">Bo Dai</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+J">Jiangmiao Pang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13424v1-abstract-short" style="display: inline;"> Large-scale articulated objects with high quality are desperately needed for multiple tasks related to embodied AI. Most existing methods for creating articulated objects are either data-driven or simulation based, which are limited by the scale and quality of the training data or the fidelity and heavy labour of the simulation. In this paper, we propose Infinite Mobility, a novel method for synth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13424v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13424v1-abstract-full" style="display: none;"> Large-scale articulated objects with high quality are desperately needed for multiple tasks related to embodied AI. Most existing methods for creating articulated objects are either data-driven or simulation based, which are limited by the scale and quality of the training data or the fidelity and heavy labour of the simulation. In this paper, we propose Infinite Mobility, a novel method for synthesizing high-fidelity articulated objects through procedural generation. User study and quantitative evaluation demonstrate that our method can produce results that excel current state-of-the-art methods and are comparable to human-annotated datasets in both physics property and mesh quality. Furthermore, we show that our synthetic data can be used as training data for generative models, enabling next-step scaling up. Code is available at https://github.com/Intern-Nexus/Infinite-Mobility <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13424v1-abstract-full').style.display = 'none'; document.getElementById('2503.13424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://infinite-mobility.github.io 10 pages,12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13224">arXiv:2503.13224</a> <span> [<a href="https://arxiv.org/pdf/2503.13224">pdf</a>, <a href="https://arxiv.org/format/2503.13224">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ProDiF: Protecting Domain-Invariant Features to Secure Pre-Trained Models Against Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tong Zhou</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+S">Shijin Duan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Gaowen Liu</a>, <a href="/search/cs?searchtype=author&query=Fleming%2C+C">Charles Fleming</a>, <a href="/search/cs?searchtype=author&query=Kompella%2C+R+R">Ramana Rao Kompella</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+S">Shaolei Ren</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolin Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13224v1-abstract-short" style="display: inline;"> Pre-trained models are valuable intellectual property, capturing both domain-specific and domain-invariant features within their weight spaces. However, model extraction attacks threaten these assets by enabling unauthorized source-domain inference and facilitating cross-domain transfer via the exploitation of domain-invariant features. In this work, we introduce **ProDiF**, a novel framework that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13224v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13224v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13224v1-abstract-full" style="display: none;"> Pre-trained models are valuable intellectual property, capturing both domain-specific and domain-invariant features within their weight spaces. However, model extraction attacks threaten these assets by enabling unauthorized source-domain inference and facilitating cross-domain transfer via the exploitation of domain-invariant features. In this work, we introduce **ProDiF**, a novel framework that leverages targeted weight space manipulation to secure pre-trained models against extraction attacks. **ProDiF** quantifies the transferability of filters and perturbs the weights of critical filters in unsecured memory, while preserving actual critical weights in a Trusted Execution Environment (TEE) for authorized users. A bi-level optimization further ensures resilience against adaptive fine-tuning attacks. Experimental results show that **ProDiF** reduces source-domain accuracy to near-random levels and decreases cross-domain transferability by 74.65\%, providing robust protection for pre-trained models. This work offers comprehensive protection for pre-trained DNN models and highlights the potential of weight space manipulation as a novel approach to model security. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13224v1-abstract-full').style.display = 'none'; document.getElementById('2503.13224v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the ICLR Workshop on Neural Network Weights as a New Data Modality 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12873">arXiv:2503.12873</a> <span> [<a href="https://arxiv.org/pdf/2503.12873">pdf</a>, <a href="https://arxiv.org/format/2503.12873">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> SeeAction: Towards Reverse Engineering How-What-Where of HCI Actions from Screencasts for UI Automation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dehai Zhao</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Z">Zhenchang Xing</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qinghua Lu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiwei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Liming Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12873v1-abstract-short" style="display: inline;"> UI automation is a useful technique for UI testing, bug reproduction, and robotic process automation. Recording user actions with an application assists rapid development of UI automation scripts, but existing recording techniques are intrusive, rely on OS or GUI framework accessibility support, or assume specific app implementations. Reverse engineering user actions from screencasts is non-intrus… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12873v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12873v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12873v1-abstract-full" style="display: none;"> UI automation is a useful technique for UI testing, bug reproduction, and robotic process automation. Recording user actions with an application assists rapid development of UI automation scripts, but existing recording techniques are intrusive, rely on OS or GUI framework accessibility support, or assume specific app implementations. Reverse engineering user actions from screencasts is non-intrusive, but a key reverse-engineering step is currently missing - recognizing human-understandable structured user actions ([command] [widget] [location]) from action screencasts. To fill the gap, we propose a deep learning-based computer vision model that can recognize 11 commands and 11 widgets, and generate location phrases from action screencasts, through joint learning and multi-task learning. We label a large dataset with 7260 video-action pairs, which record user interactions with Word, Zoom, Firefox, Photoshop, and Windows 10 Settings. Through extensive experiments, we confirm the effectiveness and generality of our model, and demonstrate the usefulness of a screencast-to-action-script tool built upon our model for bug reproduction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12873v1-abstract-full').style.display = 'none'; document.getElementById('2503.12873v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE/ACM International Conference on Software Engineering 2025 (ICSE 2025, Distinguished paper award)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICSE 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12533">arXiv:2503.12533</a> <span> [<a href="https://arxiv.org/pdf/2503.12533">pdf</a>, <a href="https://arxiv.org/format/2503.12533">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Being-0: A Humanoid Robotic Agent with Vision-Language Models and Modular Skills </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Haoqi Yuan</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yu Bai</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yuhui Fu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bohan Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yicheng Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinrun Xu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yi Zhan</a>, <a href="/search/cs?searchtype=author&query=Karlsson%2C+B+F">B枚rje F. Karlsson</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zongqing Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12533v1-abstract-short" style="display: inline;"> Building autonomous robotic agents capable of achieving human-level performance in real-world embodied tasks is an ultimate goal in humanoid robot research. Recent advances have made significant progress in high-level cognition with Foundation Models (FMs) and low-level skill development for humanoid robots. However, directly combining these components often results in poor robustness and efficien… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12533v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12533v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12533v1-abstract-full" style="display: none;"> Building autonomous robotic agents capable of achieving human-level performance in real-world embodied tasks is an ultimate goal in humanoid robot research. Recent advances have made significant progress in high-level cognition with Foundation Models (FMs) and low-level skill development for humanoid robots. However, directly combining these components often results in poor robustness and efficiency due to compounding errors in long-horizon tasks and the varied latency of different modules. We introduce Being-0, a hierarchical agent framework that integrates an FM with a modular skill library. The FM handles high-level cognitive tasks such as instruction understanding, task planning, and reasoning, while the skill library provides stable locomotion and dexterous manipulation for low-level control. To bridge the gap between these levels, we propose a novel Connector module, powered by a lightweight vision-language model (VLM). The Connector enhances the FM's embodied capabilities by translating language-based plans into actionable skill commands and dynamically coordinating locomotion and manipulation to improve task success. With all components, except the FM, deployable on low-cost onboard computation devices, Being-0 achieves efficient, real-time performance on a full-sized humanoid robot equipped with dexterous hands and active vision. Extensive experiments in large indoor environments demonstrate Being-0's effectiveness in solving complex, long-horizon tasks that require challenging navigation and manipulation subtasks. For further details and videos, visit https://beingbeyond.github.io/being-0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12533v1-abstract-full').style.display = 'none'; document.getElementById('2503.12533v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12228">arXiv:2503.12228</a> <span> [<a href="https://arxiv.org/pdf/2503.12228">pdf</a>, <a href="https://arxiv.org/format/2503.12228">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Fault Tolerance Mechanisms of Large Language Models in Cloud Computing Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yihong Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ze Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinhe Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shuyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12228v1-abstract-short" style="display: inline;"> With the rapid evolution of Large Language Models (LLMs) and their large-scale experimentation in cloud-computing spaces, the challenge of guaranteeing their security and efficiency in a failure scenario has become a main issue. To ensure the reliability and availability of large-scale language models in cloud computing scenarios, such as frequent resource failures, network problems, and computati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12228v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12228v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12228v1-abstract-full" style="display: none;"> With the rapid evolution of Large Language Models (LLMs) and their large-scale experimentation in cloud-computing spaces, the challenge of guaranteeing their security and efficiency in a failure scenario has become a main issue. To ensure the reliability and availability of large-scale language models in cloud computing scenarios, such as frequent resource failures, network problems, and computational overheads, this study proposes a novel adaptive fault tolerance mechanism. It builds upon known fault-tolerant mechanisms, such as checkpointing, redundancy, and state transposition, introducing dynamic resource allocation and prediction of failure based on real-time performance metrics. The hybrid model integrates data driven deep learning-based anomaly detection technique underlining the contribution of cloud orchestration middleware for predictive prevention of system failures. Additionally, the model integrates adaptive checkpointing and recovery strategies that dynamically adapt according to load and system state to minimize the influence on the performance of the model and minimize downtime. The experimental results demonstrate that the designed model considerably enhances the fault tolerance in large-scale cloud surroundings, and decreases the system downtime by $\mathbf{30\%}$, and has a better modeling availability than the classical fault tolerance mechanism. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12228v1-abstract-full').style.display = 'none'; document.getElementById('2503.12228v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE ICCEA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12226">arXiv:2503.12226</a> <span> [<a href="https://arxiv.org/pdf/2503.12226">pdf</a>, <a href="https://arxiv.org/format/2503.12226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Research on Large Language Model Cross-Cloud Privacy Protection and Collaborative Training based on Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ze Yang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yihong Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Juntian Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinhe Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12226v1-abstract-short" style="display: inline;"> The fast development of large language models (LLMs) and popularization of cloud computing have led to increasing concerns on privacy safeguarding and data security of cross-cloud model deployment and training as the key challenges. We present a new framework for addressing these issues along with enabling privacy preserving collaboration on training between distributed clouds based on federated l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12226v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12226v1-abstract-full" style="display: none;"> The fast development of large language models (LLMs) and popularization of cloud computing have led to increasing concerns on privacy safeguarding and data security of cross-cloud model deployment and training as the key challenges. We present a new framework for addressing these issues along with enabling privacy preserving collaboration on training between distributed clouds based on federated learning. Our mechanism encompasses cutting-edge cryptographic primitives, dynamic model aggregation techniques, and cross-cloud data harmonization solutions to enhance security, efficiency, and scalability to the traditional federated learning paradigm. Furthermore, we proposed a hybrid aggregation scheme to mitigate the threat of Data Leakage and to optimize the aggregation of model updates, thus achieving substantial enhancement on the model effectiveness and stability. Experimental results demonstrate that the training efficiency, privacy protection, and model accuracy of the proposed model compare favorably to those of the traditional federated learning method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12226v1-abstract-full').style.display = 'none'; document.getElementById('2503.12226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE AINIT 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11742">arXiv:2503.11742</a> <span> [<a href="https://arxiv.org/pdf/2503.11742">pdf</a>, <a href="https://arxiv.org/format/2503.11742">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Safe Vision-Language Models via Unsafe Weights Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=D%27Inc%C3%A0%2C+M">Moreno D'Inc脿</a>, <a href="/search/cs?searchtype=author&query=Peruzzo%2C+E">Elia Peruzzo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xingqian Xu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Humphrey Shi</a>, <a href="/search/cs?searchtype=author&query=Sebe%2C+N">Nicu Sebe</a>, <a href="/search/cs?searchtype=author&query=Mancini%2C+M">Massimiliano Mancini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11742v1-abstract-short" style="display: inline;"> Vision-language models (VLMs) often inherit the biases and unsafe associations present within their large-scale training dataset. While recent approaches mitigate unsafe behaviors, their evaluation focuses on how safe the model is on unsafe inputs, ignoring potential shortcomings on safe ones. In this paper, we first revise safety evaluation by introducing SafeGround, a new set of metrics that eva… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11742v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11742v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11742v1-abstract-full" style="display: none;"> Vision-language models (VLMs) often inherit the biases and unsafe associations present within their large-scale training dataset. While recent approaches mitigate unsafe behaviors, their evaluation focuses on how safe the model is on unsafe inputs, ignoring potential shortcomings on safe ones. In this paper, we first revise safety evaluation by introducing SafeGround, a new set of metrics that evaluate safety at different levels of granularity. With this metric, we uncover a surprising issue of training-based methods: they make the model less safe on safe inputs. From this finding, we take a different direction and explore whether it is possible to make a model safer without training, introducing Unsafe Weights Manipulation (UWM). UWM uses a calibration set of safe and unsafe instances to compare activations between safe and unsafe content, identifying the most important parameters for processing the latter. Their values are then manipulated via negation. Experiments show that UWM achieves the best tradeoff between safety and knowledge preservation, consistently improving VLMs on unsafe queries while outperforming even training-based state-of-the-art methods on safe ones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11742v1-abstract-full').style.display = 'none'; document.getElementById('2503.11742v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11088">arXiv:2503.11088</a> <span> [<a href="https://arxiv.org/pdf/2503.11088">pdf</a>, <a href="https://arxiv.org/format/2503.11088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Industrial Anomaly Detection with Epipolar Constrained Cross-View Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xun Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shijie Li</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jingyi Liao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xulei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11088v1-abstract-short" style="display: inline;"> Multi-camera systems provide richer contextual information for industrial anomaly detection. However, traditional methods process each view independently, disregarding the complementary information across viewpoints. Existing multi-view anomaly detection approaches typically employ data-driven cross-view attention for feature fusion but fail to leverage the unique geometric properties of multi-cam… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11088v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11088v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11088v1-abstract-full" style="display: none;"> Multi-camera systems provide richer contextual information for industrial anomaly detection. However, traditional methods process each view independently, disregarding the complementary information across viewpoints. Existing multi-view anomaly detection approaches typically employ data-driven cross-view attention for feature fusion but fail to leverage the unique geometric properties of multi-camera setups. In this work, we introduce an epipolar geometry-constrained attention module to guide cross-view fusion, ensuring more effective information aggregation. To further enhance the potential of cross-view attention, we propose a pretraining strategy inspired by memory bank-based anomaly detection. This approach encourages normal feature representations to form multiple local clusters and incorporate multi-view aware negative sample synthesis to regularize pretraining. We demonstrate that our epipolar guided multi-view anomaly detection framework outperforms existing methods on the state-of-the-art multi-view anomaly detection dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11088v1-abstract-full').style.display = 'none'; document.getElementById('2503.11088v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10840">arXiv:2503.10840</a> <span> [<a href="https://arxiv.org/pdf/2503.10840">pdf</a>, <a href="https://arxiv.org/format/2503.10840">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Efficient Reachability Analysis for Convolutional Neural Networks Using Hybrid Zonotopes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiangru Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10840v1-abstract-short" style="display: inline;"> Feedforward neural networks are widely used in autonomous systems, particularly for control and perception tasks within the system loop. However, their vulnerability to adversarial attacks necessitates formal verification before deployment in safety-critical applications. Existing set propagation-based reachability analysis methods for feedforward neural networks often struggle to achieve both sca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10840v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10840v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10840v1-abstract-full" style="display: none;"> Feedforward neural networks are widely used in autonomous systems, particularly for control and perception tasks within the system loop. However, their vulnerability to adversarial attacks necessitates formal verification before deployment in safety-critical applications. Existing set propagation-based reachability analysis methods for feedforward neural networks often struggle to achieve both scalability and accuracy. This work presents a novel set-based approach for computing the reachable sets of convolutional neural networks. The proposed method leverages a hybrid zonotope representation and an efficient neural network reduction technique, providing a flexible trade-off between computational complexity and approximation accuracy. Numerical examples are presented to demonstrate the effectiveness of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10840v1-abstract-full').style.display = 'none'; document.getElementById('2503.10840v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2025 American Control Conference (ACC). 8 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10719">arXiv:2503.10719</a> <span> [<a href="https://arxiv.org/pdf/2503.10719">pdf</a>, <a href="https://arxiv.org/format/2503.10719">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Long-Video Audio Synthesis with Multi-Agent Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yehang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinli Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaojie Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingcong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10719v2-abstract-short" style="display: inline;"> Video-to-audio synthesis, which generates synchronized audio for visual content, critically enhances viewer immersion and narrative coherence in film and interactive media. However, video-to-audio dubbing for long-form content remains an unsolved challenge due to dynamic semantic shifts, temporal misalignment, and the absence of dedicated datasets. While existing methods excel in short videos, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10719v2-abstract-full').style.display = 'inline'; document.getElementById('2503.10719v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10719v2-abstract-full" style="display: none;"> Video-to-audio synthesis, which generates synchronized audio for visual content, critically enhances viewer immersion and narrative coherence in film and interactive media. However, video-to-audio dubbing for long-form content remains an unsolved challenge due to dynamic semantic shifts, temporal misalignment, and the absence of dedicated datasets. While existing methods excel in short videos, they falter in long scenarios (e.g., movies) due to fragmented synthesis and inadequate cross-scene consistency. We propose LVAS-Agent, a novel multi-agent framework that emulates professional dubbing workflows through collaborative role specialization. Our approach decomposes long-video synthesis into four steps including scene segmentation, script generation, sound design and audio synthesis. Central innovations include a discussion-correction mechanism for scene/script refinement and a generation-retrieval loop for temporal-semantic alignment. To enable systematic evaluation, we introduce LVAS-Bench, the first benchmark with 207 professionally curated long videos spanning diverse scenarios. Experiments demonstrate superior audio-visual alignment over baseline methods. Project page: https://lvas-agent.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10719v2-abstract-full').style.display = 'none'; document.getElementById('2503.10719v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10630">arXiv:2503.10630</a> <span> [<a href="https://arxiv.org/pdf/2503.10630">pdf</a>, <a href="https://arxiv.org/format/2503.10630">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> UniGoal: Towards Universal Zero-shot Goal-oriented Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hang Yin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiuwei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lingqing Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10630v3-abstract-short" style="display: inline;"> In this paper, we propose a general framework for universal zero-shot goal-oriented navigation. Existing zero-shot methods build inference framework upon large language models (LLM) for specific tasks, which differs a lot in overall pipeline and fails to generalize across different types of goal. Towards the aim of universal zero-shot navigation, we propose a uniform graph representation to unify… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10630v3-abstract-full').style.display = 'inline'; document.getElementById('2503.10630v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10630v3-abstract-full" style="display: none;"> In this paper, we propose a general framework for universal zero-shot goal-oriented navigation. Existing zero-shot methods build inference framework upon large language models (LLM) for specific tasks, which differs a lot in overall pipeline and fails to generalize across different types of goal. Towards the aim of universal zero-shot navigation, we propose a uniform graph representation to unify different goals, including object category, instance image and text description. We also convert the observation of agent into an online maintained scene graph. With this consistent scene and goal representation, we preserve most structural information compared with pure text and are able to leverage LLM for explicit graph-based reasoning. Specifically, we conduct graph matching between the scene graph and goal graph at each time instant and propose different strategies to generate long-term goal of exploration according to different matching states. The agent first iteratively searches subgraph of goal when zero-matched. With partial matching, the agent then utilizes coordinate projection and anchor pair alignment to infer the goal location. Finally scene graph correction and goal verification are applied for perfect matching. We also present a blacklist mechanism to enable robust switch between stages. Extensive experiments on several benchmarks show that our UniGoal achieves state-of-the-art zero-shot performance on three studied navigation tasks with a single model, even outperforming task-specific zero-shot methods and supervised universal methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10630v3-abstract-full').style.display = 'none'; document.getElementById('2503.10630v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025. Project page: https://bagh2178.github.io/UniGoal/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10109">arXiv:2503.10109</a> <span> [<a href="https://arxiv.org/pdf/2503.10109">pdf</a>, <a href="https://arxiv.org/format/2503.10109">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dream-IF: Dynamic Relative EnhAnceMent for Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xingxin Xu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+B">Bing Cao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinan Xia</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Pengfei Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10109v1-abstract-short" style="display: inline;"> Image fusion aims to integrate comprehensive information from images acquired through multiple sources. However, images captured by diverse sensors often encounter various degradations that can negatively affect fusion quality. Traditional fusion methods generally treat image enhancement and fusion as separate processes, overlooking the inherent correlation between them; notably, the dominant regi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10109v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10109v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10109v1-abstract-full" style="display: none;"> Image fusion aims to integrate comprehensive information from images acquired through multiple sources. However, images captured by diverse sensors often encounter various degradations that can negatively affect fusion quality. Traditional fusion methods generally treat image enhancement and fusion as separate processes, overlooking the inherent correlation between them; notably, the dominant regions in one modality of a fused image often indicate areas where the other modality might benefit from enhancement. Inspired by this observation, we introduce the concept of dominant regions for image enhancement and present a Dynamic Relative EnhAnceMent framework for Image Fusion (Dream-IF). This framework quantifies the relative dominance of each modality across different layers and leverages this information to facilitate reciprocal cross-modal enhancement. By integrating the relative dominance derived from image fusion, our approach supports not only image restoration but also a broader range of image enhancement applications. Furthermore, we employ prompt-based encoding to capture degradation-specific details, which dynamically steer the restoration process and promote coordinated enhancement in both multi-modal image fusion and image enhancement scenarios. Extensive experimental results demonstrate that Dream-IF consistently outperforms its counterparts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10109v1-abstract-full').style.display = 'none'; document.getElementById('2503.10109v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09962">arXiv:2503.09962</a> <span> [<a href="https://arxiv.org/pdf/2503.09962">pdf</a>, <a href="https://arxiv.org/format/2503.09962">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Modeling Thousands of Human Annotators for Generalizable Text-to-Image Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiayu Jiang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Changxing Ding</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+W">Wentao Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junhong Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jin Tao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiangmin Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09962v1-abstract-short" style="display: inline;"> Text-to-image person re-identification (ReID) aims to retrieve the images of an interested person based on textual descriptions. One main challenge for this task is the high cost in manually annotating large-scale databases, which affects the generalization ability of ReID models. Recent works handle this problem by leveraging Multi-modal Large Language Models (MLLMs) to describe pedestrian images… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09962v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09962v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09962v1-abstract-full" style="display: none;"> Text-to-image person re-identification (ReID) aims to retrieve the images of an interested person based on textual descriptions. One main challenge for this task is the high cost in manually annotating large-scale databases, which affects the generalization ability of ReID models. Recent works handle this problem by leveraging Multi-modal Large Language Models (MLLMs) to describe pedestrian images automatically. However, the captions produced by MLLMs lack diversity in description styles. To address this issue, we propose a Human Annotator Modeling (HAM) approach to enable MLLMs to mimic the description styles of thousands of human annotators. Specifically, we first extract style features from human textual descriptions and perform clustering on them. This allows us to group textual descriptions with similar styles into the same cluster. Then, we employ a prompt to represent each of these clusters and apply prompt learning to mimic the description styles of different human annotators. Furthermore, we define a style feature space and perform uniform sampling in this space to obtain more diverse clustering prototypes, which further enriches the diversity of the MLLM-generated captions. Finally, we adopt HAM to automatically annotate a massive-scale database for text-to-image ReID. Extensive experiments on this database demonstrate that it significantly improves the generalization ability of ReID models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09962v1-abstract-full').style.display = 'none'; document.getElementById('2503.09962v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025. Project website: https://github.com/sssaury/HAM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09257">arXiv:2503.09257</a> <span> [<a href="https://arxiv.org/pdf/2503.09257">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> </div> </div> <p class="title is-5 mathjax"> DeepInnovation AI: A Global Dataset Mapping the AI innovation from Academic Research to Industrial Patents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+H">Haixing Gong</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+H">Hui Zou</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xingzhou Liang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+S">Shiyuan Meng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+P">Pinlong Cai</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xingcheng Xu</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+J">Jingjing Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09257v4-abstract-short" style="display: inline;"> In the rapidly evolving field of artificial intelligence (AI), mapping innovation patterns and understanding effective technology transfer from research to applications are essential for economic growth. However, existing data infrastructures suffer from fragmentation, incomplete coverage, and insufficient evaluative capacity. Here, we present DeepInnovationAI, a comprehensive global dataset conta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09257v4-abstract-full').style.display = 'inline'; document.getElementById('2503.09257v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09257v4-abstract-full" style="display: none;"> In the rapidly evolving field of artificial intelligence (AI), mapping innovation patterns and understanding effective technology transfer from research to applications are essential for economic growth. However, existing data infrastructures suffer from fragmentation, incomplete coverage, and insufficient evaluative capacity. Here, we present DeepInnovationAI, a comprehensive global dataset containing three structured files. DeepPatentAI.csv: Contains 2,356,204 patent records with 8 field-specific attributes. DeepDiveAI.csv: Encompasses 3,511,929 academic publications with 13 metadata fields. These two datasets leverage large language models, multilingual text analysis and dual-layer BERT classifiers to accurately identify AI-related content, while utilizing hypergraph analysis to create robust innovation metrics. Additionally, DeepCosineAI.csv: By applying semantic vector proximity analysis, this file presents approximately one hundred million calculated paper-patent similarity pairs to enhance understanding of how theoretical advancements translate into commercial technologies. DeepInnovationAI enables researchers, policymakers, and industry leaders to anticipate trends and identify collaboration opportunities. With extensive temporal and geographical scope, it supports detailed analysis of technological development patterns and international competition dynamics, establishing a foundation for modeling AI innovation and technology transfer processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09257v4-abstract-full').style.display = 'none'; document.getElementById('2503.09257v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages and 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08588">arXiv:2503.08588</a> <span> [<a href="https://arxiv.org/pdf/2503.08588">pdf</a>, <a href="https://arxiv.org/format/2503.08588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BiasEdit: Debiasing Stereotyped Language Models via Model Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xin Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyu Zhang</a>, <a href="/search/cs?searchtype=author&query=McAuley%2C+J">Julian McAuley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08588v1-abstract-short" style="display: inline;"> Previous studies have established that language models manifest stereotyped biases. Existing debiasing strategies, such as retraining a model with counterfactual data, representation projection, and prompting often fail to efficiently eliminate bias or directly alter the models' biased internal representations. To address these issues, we propose BiasEdit, an efficient model editing method to remo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08588v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08588v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08588v1-abstract-full" style="display: none;"> Previous studies have established that language models manifest stereotyped biases. Existing debiasing strategies, such as retraining a model with counterfactual data, representation projection, and prompting often fail to efficiently eliminate bias or directly alter the models' biased internal representations. To address these issues, we propose BiasEdit, an efficient model editing method to remove stereotypical bias from language models through lightweight networks that act as editors to generate parameter updates. BiasEdit employs a debiasing loss guiding editor networks to conduct local edits on partial parameters of a language model for debiasing while preserving the language modeling abilities during editing through a retention loss. Experiments on StereoSet and Crows-Pairs demonstrate the effectiveness, efficiency, and robustness of BiasEdit in eliminating bias compared to tangental debiasing baselines and little to no impact on the language models' general capabilities. In addition, we conduct bias tracing to probe bias in various modules and explore bias editing impacts on different components of language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08588v1-abstract-full').style.display = 'none'; document.getElementById('2503.08588v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TrustNLP @ NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08354">arXiv:2503.08354</a> <span> [<a href="https://arxiv.org/pdf/2503.08354">pdf</a>, <a href="https://arxiv.org/format/2503.08354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Robust Latent Matters: Boosting Image Generation with Sampling Error Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+K">Kai Qiu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Kuen%2C+J">Jason Kuen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaohao Xu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yinyi Luo</a>, <a href="/search/cs?searchtype=author&query=Raj%2C+B">Bhiksha Raj</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Savvides%2C+M">Marios Savvides</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08354v2-abstract-short" style="display: inline;"> Recent image generation schemes typically capture image distribution in a pre-constructed latent space relying on a frozen image tokenizer. Though the performance of tokenizer plays an essential role to the successful generation, its current evaluation metrics (e.g. rFID) fail to precisely assess the tokenizer and correlate its performance to the generation quality (e.g. gFID). In this paper, we c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08354v2-abstract-full').style.display = 'inline'; document.getElementById('2503.08354v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08354v2-abstract-full" style="display: none;"> Recent image generation schemes typically capture image distribution in a pre-constructed latent space relying on a frozen image tokenizer. Though the performance of tokenizer plays an essential role to the successful generation, its current evaluation metrics (e.g. rFID) fail to precisely assess the tokenizer and correlate its performance to the generation quality (e.g. gFID). In this paper, we comprehensively analyze the reason for the discrepancy of reconstruction and generation qualities in a discrete latent space, and, from which, we propose a novel plug-and-play tokenizer training scheme to facilitate latent space construction. Specifically, a latent perturbation approach is proposed to simulate sampling noises, i.e., the unexpected tokens sampled, from the generative process. With the latent perturbation, we further propose (1) a novel tokenizer evaluation metric, i.e., pFID, which successfully correlates the tokenizer performance to generation quality and (2) a plug-and-play tokenizer training scheme, which significantly enhances the robustness of tokenizer thus boosting the generation quality and convergence speed. Extensive benchmarking are conducted with 11 advanced discrete image tokenizers with 2 autoregressive generation models to validate our approach. The tokenizer trained with our proposed latent perturbation achieve a notable 1.60 gFID with classifier-free guidance (CFG) and 3.45 gFID without CFG with a $\sim$400M generator. Code: https://github.com/lxa9867/ImageFolder. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08354v2-abstract-full').style.display = 'none'; document.getElementById('2503.08354v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 13 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08079">arXiv:2503.08079</a> <span> [<a href="https://arxiv.org/pdf/2503.08079">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Advancing Sentiment Analysis: A Novel LSTM Framework with Multi-head Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yi%2C+J">Jingyuan Yi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P">Peiyang Yu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tianyi Huang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaochuan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08079v1-abstract-short" style="display: inline;"> This work proposes an LSTM-based sentiment classification model with multi-head attention mechanism and TF-IDF optimization. Through the integration of TF-IDF feature extraction and multi-head attention, the model significantly improves text sentiment analysis performance. Experimental results on public data sets demonstrate that the new method achieves substantial improvements in the most critica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08079v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08079v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08079v1-abstract-full" style="display: none;"> This work proposes an LSTM-based sentiment classification model with multi-head attention mechanism and TF-IDF optimization. Through the integration of TF-IDF feature extraction and multi-head attention, the model significantly improves text sentiment analysis performance. Experimental results on public data sets demonstrate that the new method achieves substantial improvements in the most critical metrics like accuracy, recall, and F1-score compared to baseline models. Specifically, the model achieves an accuracy of 80.28% on the test set, which is improved by about 12% in comparison with standard LSTM models. Ablation experiments also support the necessity and necessity of all modules, in which the impact of multi-head attention is greatest to performance improvement. This research provides a proper approach to sentiment analysis, which can be utilized in public opinion monitoring, product recommendation, etc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08079v1-abstract-full').style.display = 'none'; document.getElementById('2503.08079v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08062">arXiv:2503.08062</a> <span> [<a href="https://arxiv.org/pdf/2503.08062">pdf</a>, <a href="https://arxiv.org/format/2503.08062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> How Does CP Length Affect the Sensing Range for OFDM-ISAC? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaoli Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhiwen Zhou</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yong Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08062v1-abstract-short" style="display: inline;"> Orthogonal frequency division multiplexing (OFDM), which has been the dominating waveform for contemporary wireless communications, is also regarded as a competitive candidate for future integrated sensing and communication (ISAC) systems. Existing works on OFDM-ISAC usually assume that the maximum sensing range should be limited by the cyclic prefix (CP) length since inter-symbol interference (IS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08062v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08062v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08062v1-abstract-full" style="display: none;"> Orthogonal frequency division multiplexing (OFDM), which has been the dominating waveform for contemporary wireless communications, is also regarded as a competitive candidate for future integrated sensing and communication (ISAC) systems. Existing works on OFDM-ISAC usually assume that the maximum sensing range should be limited by the cyclic prefix (CP) length since inter-symbol interference (ISI) and inter-carrier interference (ICI) should be avoided. However, in this paper, we provide rigorous analysis to reveal that the random data embedded in OFDM-ISAC signal can actually act as a free ``mask" for ISI, which makes ISI/ICI random and hence greatly attenuated after radar signal processing. The derived signal-to-interference-plus-noise ratio (SINR) in the range profile demonstrates that the maximum sensing range of OFDM-ISAC can greatly exceed the ISI-free distance that is limited by the CP length, which is validated by simulation results. To further mitigate power degradation for long-range targets, a novel sliding window sensing method is proposed, which iteratively detects and cancels short-range targets before shifting the detection window. The shifted detection window can effectively compensate the power degradation due to insufficient CP length for long-range targets. Such results provide valuable guidance for the CP length design in OFDM-ISAC systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08062v1-abstract-full').style.display = 'none'; document.getElementById('2503.08062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08035">arXiv:2503.08035</a> <span> [<a href="https://arxiv.org/pdf/2503.08035">pdf</a>, <a href="https://arxiv.org/format/2503.08035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Group Preference Alignment: Customized LLM Response Generation from In-Situ Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mondal%2C+I">Ishani Mondal</a>, <a href="/search/cs?searchtype=author&query=Stokes%2C+J+W">Jack W. Stokes</a>, <a href="/search/cs?searchtype=author&query=Jauhar%2C+S+K">Sujay Kumar Jauhar</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Longqi Yang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+M">Mengting Wan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaofeng Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xia Song</a>, <a href="/search/cs?searchtype=author&query=Neville%2C+J">Jennifer Neville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08035v1-abstract-short" style="display: inline;"> LLMs often fail to meet the specialized needs of distinct user groups due to their one-size-fits-all training paradigm \cite{lucy-etal-2024-one} and there is limited research on what personalization aspects each group expect. To address these limitations, we propose a group-aware personalization framework, Group Preference Alignment (GPA), that identifies context-specific variations in conversatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08035v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08035v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08035v1-abstract-full" style="display: none;"> LLMs often fail to meet the specialized needs of distinct user groups due to their one-size-fits-all training paradigm \cite{lucy-etal-2024-one} and there is limited research on what personalization aspects each group expect. To address these limitations, we propose a group-aware personalization framework, Group Preference Alignment (GPA), that identifies context-specific variations in conversational preferences across user groups and then steers LLMs to address those preferences. Our approach consists of two steps: (1) Group-Aware Preference Extraction, where maximally divergent user-group preferences are extracted from real-world conversation logs and distilled into interpretable rubrics, and (2) Tailored Response Generation, which leverages these rubrics through two methods: a) Context-Tuned Inference (GAP-CT), that dynamically adjusts responses via context-dependent prompt instructions, and b) Rubric-Finetuning Inference (GPA-FT), which uses the rubrics to generate contrastive synthetic data for personalization of group-specific models via alignment. Experiments demonstrate that our framework significantly improves alignment of the output with respect to user preferences and outperforms baseline methods, while maintaining robust performance on standard benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08035v1-abstract-full').style.display = 'none'; document.getElementById('2503.08035v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07961">arXiv:2503.07961</a> <span> [<a href="https://arxiv.org/pdf/2503.07961">pdf</a>, <a href="https://arxiv.org/format/2503.07961">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Overlap-aware meta-learning attention to enhance hypergraph neural networks for node classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+M">Murong Yang</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+S">Shihui Ying</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xin-Jian Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07961v1-abstract-short" style="display: inline;"> Although hypergraph neural networks (HGNNs) have emerged as a powerful framework for analyzing complex datasets, their practical performance often remains limited. On one hand, existing networks typically employ a single type of attention mechanism, focusing on either structural or feature similarities during message passing. On the other hand, assuming that all nodes in current hypergraph models… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07961v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07961v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07961v1-abstract-full" style="display: none;"> Although hypergraph neural networks (HGNNs) have emerged as a powerful framework for analyzing complex datasets, their practical performance often remains limited. On one hand, existing networks typically employ a single type of attention mechanism, focusing on either structural or feature similarities during message passing. On the other hand, assuming that all nodes in current hypergraph models have the same level of overlap may lead to suboptimal generalization. To overcome these limitations, we propose a novel framework, overlap-aware meta-learning attention for hypergraph neural networks (OMA-HGNN). First, we introduce a hypergraph attention mechanism that integrates both structural and feature similarities. Specifically, we linearly combine their respective losses with weighted factors for the HGNN model. Second, we partition nodes into different tasks based on their diverse overlap levels and develop a multi-task Meta-Weight-Net (MWN) to determine the corresponding weighted factors. Third, we jointly train the internal MWN model with the losses from the external HGNN model and train the external model with the weighted factors from the internal model. To evaluate the effectiveness of OMA-HGNN, we conducted experiments on six real-world datasets and benchmarked its perfor-mance against nine state-of-the-art methods for node classification. The results demonstrate that OMA-HGNN excels in learning superior node representations and outperforms these baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07961v1-abstract-full').style.display = 'none'; document.getElementById('2503.07961v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">latex, 45 pages, 5 figures, 3 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07959">arXiv:2503.07959</a> <span> [<a href="https://arxiv.org/pdf/2503.07959">pdf</a>, <a href="https://arxiv.org/format/2503.07959">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Recent Advances in Hypergraph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+M">Murong Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xin-Jian Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07959v1-abstract-short" style="display: inline;"> The growing interest in hypergraph neural networks (HGNNs) is driven by their capacity to capture the complex relationships and patterns within hypergraph structured data across various domains, including computer vision, complex networks, and natural language processing. This paper comprehensively reviews recent advances in HGNNs and presents a taxonomy of mainstream models based on their archite… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07959v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07959v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07959v1-abstract-full" style="display: none;"> The growing interest in hypergraph neural networks (HGNNs) is driven by their capacity to capture the complex relationships and patterns within hypergraph structured data across various domains, including computer vision, complex networks, and natural language processing. This paper comprehensively reviews recent advances in HGNNs and presents a taxonomy of mainstream models based on their architectures: hypergraph convolutional networks (HGCNs), hypergraph attention networks (HGATs), hypergraph autoencoders (HGAEs), hypergraph recurrent networks (HGRNs), and deep hypergraph generative models (DHGGMs). For each category, we delve into its practical applications, mathematical mechanisms, literature contributions, and open problems. Finally, we discuss some common challenges and promising research directions.This paper aspires to be a helpful resource that provides guidance for future research and applications of HGNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07959v1-abstract-full').style.display = 'none'; document.getElementById('2503.07959v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Latex, 35 pages, 1 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07536">arXiv:2503.07536</a> <span> [<a href="https://arxiv.org/pdf/2503.07536">pdf</a>, <a href="https://arxiv.org/format/2503.07536">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LMM-R1: Empowering 3B LMMs with Strong Reasoning Abilities Through Two-Stage Rule-Based RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yingzhe Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Gongrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miaosen Zhang</a>, <a href="/search/cs?searchtype=author&query=You%2C+Z">Zhiyuan You</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qipeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xingzhong Xu</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xin Geng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07536v2-abstract-short" style="display: inline;"> Enhancing reasoning in Large Multimodal Models (LMMs) faces unique challenges from the complex interplay between visual perception and logical reasoning, particularly in compact 3B-parameter architectures where architectural constraints limit reasoning capacity and modality alignment. While rule-based reinforcement learning (RL) excels in text-only domains, its multimodal extension confronts two… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07536v2-abstract-full').style.display = 'inline'; document.getElementById('2503.07536v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07536v2-abstract-full" style="display: none;"> Enhancing reasoning in Large Multimodal Models (LMMs) faces unique challenges from the complex interplay between visual perception and logical reasoning, particularly in compact 3B-parameter architectures where architectural constraints limit reasoning capacity and modality alignment. While rule-based reinforcement learning (RL) excels in text-only domains, its multimodal extension confronts two critical barriers: (1) data limitations due to ambiguous answers and scarce complex reasoning examples, and (2) degraded foundational reasoning induced by multimodal pretraining. To address these challenges, we propose \textbf{LMM-R1}, a two-stage framework adapting rule-based RL for multimodal reasoning through \textbf{Foundational Reasoning Enhancement (FRE)} followed by \textbf{Multimodal Generalization Training (MGT)}. The FRE stage first strengthens reasoning abilities using text-only data with rule-based RL, then the MGT stage generalizes these reasoning capabilities to multimodal domains. Experiments on Qwen2.5-VL-Instruct-3B demonstrate that LMM-R1 achieves 4.83\% and 4.5\% average improvements over baselines in multimodal and text-only benchmarks, respectively, with a 3.63\% gain in complex Football Game tasks. These results validate that text-based reasoning enhancement enables effective multimodal generalization, offering a data-efficient paradigm that bypasses costly high-quality multimodal training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07536v2-abstract-full').style.display = 'none'; document.getElementById('2503.07536v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xu%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository