Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 2,254 results for author: <span class="mathjax">Huang, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Huang%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huang, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huang%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huang, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Huang%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20749">arXiv:2503.20749</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20749">pdf</a>, <a href="https://arxiv.org/format/2503.20749">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Believability: Accurate Human Behavior Simulation with Fine-Tuned LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yuxuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yan Han</a>, <a href="/search/cs?searchtype=author&amp;query=Bei%2C+B">Bennet Bei</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yaochen Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dakuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jessie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Q">Qi He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20749v1-abstract-short" style="display: inline;"> Recent research shows that LLMs can simulate ``believable&#39;&#39; human behaviors to power LLM agents via prompt-only methods. 
In this work, we focus on evaluating and improving LLM&#39;s objective ``accuracy&#39;&#39; rather than the subjective ``believability&#39;&#39; in the web action generation task, leveraging a large-scale, real-world dataset collected from online shopping human actions. We present the first compreh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20749v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20749v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20749v1-abstract-full" style="display: none;"> Recent research shows that LLMs can simulate ``believable&#39;&#39; human behaviors to power LLM agents via prompt-only methods. In this work, we focus on evaluating and improving LLM&#39;s objective ``accuracy&#39;&#39; rather than the subjective ``believability&#39;&#39; in the web action generation task, leveraging a large-scale, real-world dataset collected from online shopping human actions. We present the first comprehensive quantitative evaluation of state-of-the-art LLMs (e.g., DeepSeek-R1, Llama, and Claude) on the task of web action generation. Our results show that fine-tuning LLMs on real-world behavioral data substantially improves their ability to generate actions compared to prompt-only methods. Furthermore, incorporating synthesized reasoning traces into model training leads to additional performance gains, demonstrating the value of explicit rationale in behavior modeling. This work establishes a new benchmark for evaluating LLMs in behavior simulation and offers actionable insights into how real-world action data and reasoning augmentation can enhance the fidelity of LLM agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20749v1-abstract-full').style.display = 'none'; document.getElementById('2503.20749v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20322">arXiv:2503.20322</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20322">pdf</a>, <a href="https://arxiv.org/format/2503.20322">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Pyramid Network for Efficient Multimodal Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ai%2C+H">Hao Ai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kunyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zezhou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+H">Hao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+J">Jin Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yaxin Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+P">Peng Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jen-Yuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Huaxia Li</a>, <a href="/search/cs?searchtype=author&amp;query=luo%2C+G">Gen luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20322v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in various vision-language (VL) tasks, but their expensive computations still limit the real-world application. To address this issue, recent efforts aim to compress the visual features to save the computational costs of MLLMs. However, direct visual compression methods, e.g. efficient projectors, inevitably destroy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20322v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20322v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20322v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in various vision-language (VL) tasks, but their expensive computations still limit the real-world application. To address this issue, recent efforts aim to compress the visual features to save the computational costs of MLLMs. However, direct visual compression methods, e.g. efficient projectors, inevitably destroy the visual semantics in MLLM, especially in difficult samples. To overcome this shortcoming, we propose a novel dynamic pyramid network (DPN) for efficient MLLMs. Specifically, DPN formulates MLLM as a hierarchical structure where visual features are gradually compressed with increasing depth. In this case, even with a high compression ratio, fine-grained visual information can still be perceived in shallow layers. To maximize the benefit of DPN, we further propose an innovative Dynamic Pooling Experts (DPE) that can dynamically choose the optimal visual compression rate according to input features. With this design, harder samples will be assigned larger computations, thus preserving the model performance. 
To validate our approach, we conduct extensive experiments on two popular MLLMs and ten benchmarks. Experimental results show that DPN can save up to 56% average FLOPs on LLaVA while further achieving +0.74% performance gains. Besides, the generalization ability of DPN is also validated on the existing high-resolution MLLM called LLaVA-HR. Our source codes are anonymously released at https://github.com/aihao2000/DPN-LLaVA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20322v1-abstract-full').style.display = 'none'; document.getElementById('2503.20322v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20263">arXiv:2503.20263</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20263">pdf</a>, <a href="https://arxiv.org/format/2503.20263">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> L4: Diagnosing Large-scale LLM Training Failures via Automated Log Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhihan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junjie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuangbin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yichen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+G">Guangba Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Cong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yongqiang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zengyin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+M+R">Michael R. Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20263v1-abstract-short" style="display: inline;"> As Large Language Models (LLMs) show their capabilities across various applications, training customized LLMs has become essential for modern enterprises. However, due to the complexity of LLM training, which requires massive computational resources and extensive training time, failures are inevitable during the training process. These failures result in considerable waste of resource and time, hi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20263v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20263v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20263v1-abstract-full" style="display: none;"> As Large Language Models (LLMs) show their capabilities across various applications, training customized LLMs has become essential for modern enterprises. 
However, due to the complexity of LLM training, which requires massive computational resources and extensive training time, failures are inevitable during the training process. These failures result in considerable waste of resource and time, highlighting the critical need for effective and efficient failure diagnosis to reduce the cost of LLM training. In this paper, we present the first empirical study on the failure reports of 428 LLM training failures in our production Platform-X between May 2023 and April 2024. Our study reveals that hardware and user faults are the predominant root causes, and current diagnosis processes rely heavily on training logs. Unfortunately, existing log-based diagnostic methods fall short in handling LLM training logs. Considering the unique features of LLM training, we identify three distinct patterns of LLM training logs: cross-job, spatial, and temporal patterns. We then introduce our Log-based Large-scale LLM training failure diagnosis framework, L4, which can automatically extract failure-indicating information (i.e., log events, nodes, stages, and iterations) from extensive training logs, thereby reducing manual effort and facilitating failure recovery. Experimental results on real-world datasets show that L4 outperforms existing approaches in identifying failure-indicating logs and localizing faulty nodes. Furthermore, L4 has been applied in Platform-X and demonstrated its effectiveness in enabling accurate and efficient failure diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20263v1-abstract-full').style.display = 'none'; document.getElementById('2503.20263v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in companion proceedings of the 33rd ACM International Conference on the Foundations of Software Engineering (FSE&#39;25). 
13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20244">arXiv:2503.20244</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20244">pdf</a>, <a href="https://arxiv.org/format/2503.20244">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Software Vulnerability Analysis Across Programming Language and Program Representation Landscapes: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Z">Zhuoyun Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+F">Fangtian Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Q">Qin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yili Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiaqi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengfei Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jiguo Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20244v1-abstract-short" style="display: inline;"> Modern software systems are developed in diverse programming languages and often harbor critical vulnerabilities that attackers can exploit to compromise security. These vulnerabilities have been actively targeted in real-world attacks, causing substantial harm to users and cyberinfrastructure. Since many of these flaws originate from the code itself, a variety of techniques have been proposed to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20244v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20244v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20244v1-abstract-full" style="display: none;"> Modern software systems are developed in diverse programming languages and often harbor critical vulnerabilities that attackers can exploit to compromise security. These vulnerabilities have been actively targeted in real-world attacks, causing substantial harm to users and cyberinfrastructure. Since many of these flaws originate from the code itself, a variety of techniques have been proposed to detect and mitigate them prior to software deployment. However, a comprehensive comparative study that spans different programming languages, program representations, bug types, and analysis techniques is still lacking. As a result, the relationships among programming languages, abstraction levels, vulnerability types, and detection approaches remain fragmented, and the limitations and research gaps across the landscape are not clearly understood. This article aims to bridge that gap by systematically examining widely used programming languages, levels of program representation, categories of vulnerabilities, and mainstream detection techniques. The survey provides a detailed understanding of current practices in vulnerability discovery, highlighting their strengths, limitations, and distinguishing characteristics. 
Furthermore, it identifies persistent challenges and outlines promising directions for future research in the field of software security. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20244v1-abstract-full').style.display = 'none'; document.getElementById('2503.20244v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20168">arXiv:2503.20168</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20168">pdf</a>, <a href="https://arxiv.org/format/2503.20168">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EVolSplat: Efficient Volume-based Gaussian Splatting for Urban View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+S">Sheng Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiaxin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+D">Dongfeng Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bingbing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Geiger%2C+A">Andreas Geiger</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Y">Yiyi Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20168v1-abstract-short" style="display: inline;"> Novel view synthesis of urban scenes is essential for autonomous driving-related applications.Existing NeRF and 3DGS-based methods show promising results in achieving photorealistic renderings but require slow, per-scene optimization. We introduce EVolSplat, an efficient 3D Gaussian Splatting model for urban scenes that works in a feed-forward manner. Unlike existing feed-forward, pixel-aligned 3D&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20168v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20168v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20168v1-abstract-full" style="display: none;"> Novel view synthesis of urban scenes is essential for autonomous driving-related applications.Existing NeRF and 3DGS-based methods show promising results in achieving photorealistic renderings but require slow, per-scene optimization. We introduce EVolSplat, an efficient 3D Gaussian Splatting model for urban scenes that works in a feed-forward manner. Unlike existing feed-forward, pixel-aligned 3DGS methods, which often suffer from issues like multi-view inconsistencies and duplicated content, our approach predicts 3D Gaussians across multiple frames within a unified volume using a 3D convolutional network. 
This is achieved by initializing 3D Gaussians with noisy depth predictions, and then refining their geometric properties in 3D space and predicting color based on 2D textures. Our model also handles distant views and the sky with a flexible hemisphere background model. This enables us to perform fast, feed-forward reconstruction while achieving real-time rendering. Experimental evaluations on the KITTI-360 and Waymo datasets show that our method achieves state-of-the-art quality compared to existing feed-forward 3DGS- and NeRF-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20168v1-abstract-full').style.display = 'none'; document.getElementById('2503.20168v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18640">arXiv:2503.18640</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18640">pdf</a>, <a href="https://arxiv.org/format/2503.18640">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LLGS: Unsupervised Gaussian Splatting for Image Enhancement and Reconstruction in Pure Dark Environment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoran Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jingwei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+T">Tianchen Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Gaojing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingrui Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18640v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting has shown remarkable capabilities in novel view rendering tasks and exhibits significant potential for multi-view optimization.However, the original 3D Gaussian Splatting lacks color representation for inputs in low-light environments. 
Simply using enhanced images as inputs would lead to issues with multi-view consistency, and current single-view enhancement systems rely on p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18640v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18640v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18640v1-abstract-full" style="display: none;"> 3D Gaussian Splatting has shown remarkable capabilities in novel view rendering tasks and exhibits significant potential for multi-view optimization.However, the original 3D Gaussian Splatting lacks color representation for inputs in low-light environments. Simply using enhanced images as inputs would lead to issues with multi-view consistency, and current single-view enhancement systems rely on pre-trained data, lacking scene generalization. These problems limit the application of 3D Gaussian Splatting in low-light conditions in the field of robotics, including high-fidelity modeling and feature matching. To address these challenges, we propose an unsupervised multi-view stereoscopic system based on Gaussian Splatting, called Low-Light Gaussian Splatting (LLGS). This system aims to enhance images in low-light environments while reconstructing the scene. Our method introduces a decomposable Gaussian representation called M-Color, which separately characterizes color information for targeted enhancement. Furthermore, we propose an unsupervised optimization method with zero-knowledge priors, using direction-based enhancement to ensure multi-view consistency. Experiments conducted on real-world datasets demonstrate that our system outperforms state-of-the-art methods in both low-light enhancement and 3D Gaussian Splatting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18640v1-abstract-full').style.display = 'none'; document.getElementById('2503.18640v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18168">arXiv:2503.18168</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18168">pdf</a>, <a href="https://arxiv.org/format/2503.18168">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Strategic Prompt Pricing for AIGC Services: A User-Centric Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+B">Bing Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jianwei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuan Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18168v1-abstract-short" style="display: inline;"> The rapid growth of AI-generated content (AIGC) services has created an urgent need for effective prompt pricing strategies, yet current approaches overlook users&#39; strategic two-step decision-making process in selecting and utilizing generative AI models. This oversight creates two key technical challenges: quantifying the relationship between user prompt capabilities and generation outcomes, and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18168v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18168v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18168v1-abstract-full" style="display: none;"> The rapid growth of AI-generated content (AIGC) services has created an urgent need for effective prompt pricing strategies, yet current approaches overlook users&#39; strategic two-step decision-making process in selecting and utilizing generative AI models. This oversight creates two key technical challenges: quantifying the relationship between user prompt capabilities and generation outcomes, and optimizing platform payoff while accounting for heterogeneous user behaviors. We address these challenges by introducing prompt ambiguity, a theoretical framework that captures users&#39; varying abilities in prompt engineering, and developing an Optimal Prompt Pricing (OPP) algorithm. Our analysis reveals a counterintuitive insight: users with higher prompt ambiguity (i.e., lower capability) exhibit non-monotonic prompt usage patterns, first increasing then decreasing with ambiguity levels, reflecting complex changes in marginal utility. Experimental evaluation using a character-level GPT-like model demonstrates that our OPP algorithm achieves up to 31.72% improvement in platform payoff compared to existing pricing mechanisms, validating the importance of user-centric prompt pricing in AIGC services. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18168v1-abstract-full').style.display = 'none'; document.getElementById('2503.18168v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted in WiOpt 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18135">arXiv:2503.18135</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18135">pdf</a>, <a href="https://arxiv.org/format/2503.18135">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MLLM-For3D: Adapting Multimodal Large Language Model for 3D Reasoning Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiaxin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Runnan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Ziwen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhengqing Gao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xiao He</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yandong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+M">Mingming Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tongliang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18135v1-abstract-short" style="display: inline;"> Reasoning segmentation aims to segment target objects in complex scenes based on human intent and spatial reasoning. While recent multimodal large language models (MLLMs) have demonstrated impressive 2D image reasoning segmentation, adapting these capabilities to 3D scenes remains underexplored. In this paper, we introduce MLLM-For3D, a simple yet effective framework that transfers knowledge from&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18135v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18135v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18135v1-abstract-full" style="display: none;"> Reasoning segmentation aims to segment target objects in complex scenes based on human intent and spatial reasoning. While recent multimodal large language models (MLLMs) have demonstrated impressive 2D image reasoning segmentation, adapting these capabilities to 3D scenes remains underexplored. In this paper, we introduce MLLM-For3D, a simple yet effective framework that transfers knowledge from 2D MLLMs to 3D scene understanding. Specifically, we utilize MLLMs to generate multi-view pseudo segmentation masks and corresponding text embeddings, then unproject 2D masks into 3D space and align them with the text embeddings. 
The primary challenge lies in the absence of 3D context and spatial consistency across multiple views, causing the model to hallucinate objects that do not exist and fail to target objects consistently. Training the 3D model with such irrelevant objects leads to performance degradation. To address this, we introduce a spatial consistency strategy to enforce that segmentation masks remain coherent in the 3D space, effectively capturing the geometry of the scene. Moreover, we develop a Token-for-Query approach for multimodal semantic alignment, enabling consistent identification of the same object across different views. Extensive evaluations on various challenging indoor scene benchmarks demonstrate that, even without any labeled 3D training data, MLLM-For3D outperforms existing 3D reasoning segmentation methods, effectively interpreting user intent, understanding 3D scenes, and reasoning about spatial relationships. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18135v1-abstract-full').style.display = 'none'; document.getElementById('2503.18135v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17924">arXiv:2503.17924</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17924">pdf</a>, <a href="https://arxiv.org/format/2503.17924">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> WLB-LLM: Workload-Balanced 4D Parallelism for Large Language Model Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+A">Anna Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xinfeng Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Z">Zaifeng Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Y">Yue Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+W">Weiwei Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shikai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jianyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+C">Chris Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+Y">Yuchen Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yufei Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17924v1-abstract-short" style="display: inline;"> In this work, we present WLB-LLM, a workLoad-balanced 4D parallelism for large language model training. 
We first thoroughly analyze the workload imbalance issue in LLM training and identify two primary sources of imbalance at the pipeline parallelism and context parallelism levels. Then, to address the imbalance issue, at the pipeline parallelism level, WLB-LLM incorporates a workload-aware variab&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17924v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17924v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17924v1-abstract-full" style="display: none;"> In this work, we present WLB-LLM, a workLoad-balanced 4D parallelism for large language model training. We first thoroughly analyze the workload imbalance issue in LLM training and identify two primary sources of imbalance at the pipeline parallelism and context parallelism levels. Then, to address the imbalance issue, at the pipeline parallelism level, WLB-LLM incorporates a workload-aware variable-length document packing method to balance the computation and communication workload across micro-batches. Additionally, at the context parallelism level, WLB-LLM introduces a novel fine-grained per-document sharding strategy, ensuring each worker within a context parallelism group has an identical workload. Comprehensive experiments under different model scales demonstrate that WLB-LLM significantly mitigates the workload imbalance during 4D parallelism LLM training and achieves an average speedup of 1.23x when applying WLB-LLM in our internal LLM training framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17924v1-abstract-full').style.display = 'none'; document.getElementById('2503.17924v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 16 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16456">arXiv:2503.16456</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16456">pdf</a>, <a href="https://arxiv.org/format/2503.16456">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Position: Beyond Assistance -- Reimagining LLMs as Ethical and Adaptive Co-Creators in Mental Health Care </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Badawi%2C+A">Abeer Badawi</a>, <a href="/search/cs?searchtype=author&amp;query=Laskar%2C+M+T+R">Md Tahmid Rahman Laskar</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J+X">Jimmy Xiangji Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Raza%2C+S">Shaina Raza</a>, <a href="/search/cs?searchtype=author&amp;query=Dolatabadi%2C+E">Elham Dolatabadi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16456v1-abstract-short" style="display: inline;"> This position paper argues for a fundamental shift in how Large Language Models (LLMs) are integrated into the mental health care domain. We advocate for their role as co-creators rather than mere assistive tools. While LLMs have the potential to enhance accessibility, personalization, and crisis intervention, their adoption remains limited due to concerns about bias, evaluation, over-reliance, de&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16456v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16456v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16456v1-abstract-full" style="display: none;"> This position paper argues for a fundamental shift in how Large Language Models (LLMs) are integrated into the mental health care domain. We advocate for their role as co-creators rather than mere assistive tools. While LLMs have the potential to enhance accessibility, personalization, and crisis intervention, their adoption remains limited due to concerns about bias, evaluation, over-reliance, dehumanization, and regulatory uncertainties. To address these challenges, we propose two structured pathways: SAFE-i (Supportive, Adaptive, Fair, and Ethical Implementation) Guidelines for ethical and responsible deployment, and HAAS-e (Human-AI Alignment and Safety Evaluation) Framework for multidimensional, human-centered assessment. SAFE-i provides a blueprint for data governance, adaptive model engineering, and real-world integration, ensuring LLMs align with clinical and ethical standards. HAAS-e introduces evaluation metrics that go beyond technical accuracy to measure trustworthiness, empathy, cultural sensitivity, and actionability. 
We call for the adoption of these structured approaches to establish a responsible and scalable model for LLM-driven mental health support, ensuring that AI complements-rather than replaces-human expertise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16456v1-abstract-full').style.display = 'none'; document.getElementById('2503.16456v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16328">arXiv:2503.16328</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16328">pdf</a>, <a href="https://arxiv.org/format/2503.16328">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Knowledge-guided machine learning model with soil moisture for corn yield prediction under drought conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yijia Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jingyi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhengwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhou Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16328v1-abstract-short" style="display: inline;"> Remote sensing (RS) techniques, by enabling non-contact acquisition of extensive ground observations, have become a valuable tool for corn yield prediction. Traditional process-based (PB) models are limited by fixed input features and struggle to incorporate large volumes of RS data. In contrast, machine learning (ML) models are often criticized for being ``black boxes&#39;&#39; with limited interpretabil&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16328v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16328v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16328v1-abstract-full" style="display: none;"> Remote sensing (RS) techniques, by enabling non-contact acquisition of extensive ground observations, have become a valuable tool for corn yield prediction. Traditional process-based (PB) models are limited by fixed input features and struggle to incorporate large volumes of RS data. In contrast, machine learning (ML) models are often criticized for being ``black boxes&#39;&#39; with limited interpretability. To address these limitations, we used Knowledge-Guided Machine Learning (KGML), which combined the strengths of both approaches and fully used RS data. However, previous KGML methods overlooked the crucial role of soil moisture in plant growth. 
To bridge this gap, we proposed the Knowledge-Guided Machine Learning with Soil Moisture (KGML-SM) framework, using soil moisture as an intermediate variable to emphasize its key role in plant development. Additionally, based on the prior knowledge that the model may overestimate under drought conditions, we designed a drought-aware loss function that penalizes predicted yield in drought-affected areas. Our experiments showed that the KGML-SM model outperformed other ML models. Finally, we explored the relationships between drought, soil moisture, and corn yield prediction, assessing the importance of various features and analyzing how soil moisture impacts corn yield predictions across different regions and time periods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16328v1-abstract-full').style.display = 'none'; document.getElementById('2503.16328v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16302">arXiv:2503.16302</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16302">pdf</a>, <a href="https://arxiv.org/format/2503.16302">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing Vecset Diffusion Model for Fast Shape Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lai%2C+Z">Zeqiang Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yunfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zibo Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haolin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fuyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+H">Huiwen Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xianghui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qingxiang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jingwei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuhong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chunchao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+X">Xiangyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16302v2-abstract-short" style="display: inline;"> 3D shape generation has greatly flourished through the development of so-called &#34;native&#34; 3D diffusion, particularly through the Vecset Diffusion Model (VDM). 
While recent advancements have shown promising results in generating high-resolution 3D shapes, VDM still struggles with high-speed generation. Challenges exist because of difficulties not only in accelerating diffusion sampling but also VAE&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16302v2-abstract-full').style.display = 'inline'; document.getElementById('2503.16302v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16302v2-abstract-full" style="display: none;"> 3D shape generation has greatly flourished through the development of so-called &#34;native&#34; 3D diffusion, particularly through the Vecset Diffusion Model (VDM). While recent advancements have shown promising results in generating high-resolution 3D shapes, VDM still struggles with high-speed generation. Challenges exist because of difficulties not only in accelerating diffusion sampling but also VAE decoding in VDM, areas under-explored in previous works. To address these challenges, we present FlashVDM, a systematic framework for accelerating both VAE and DiT in VDM. For DiT, FlashVDM enables flexible diffusion sampling with as few as 5 inference steps and comparable quality, which is made possible by stabilizing consistency distillation with our newly introduced Progressive Flow Distillation. For VAE, we introduce a lightning vecset decoder equipped with Adaptive KV Selection, Hierarchical Volume Decoding, and Efficient Network Design. By exploiting the locality of the vecset and the sparsity of shape surface in the volume, our decoder drastically lowers FLOPs, minimizing the overall decoding overhead. We apply FlashVDM to Hunyuan3D-2 to obtain Hunyuan3D-2 Turbo. Through systematic evaluation, we show that our model significantly outperforms existing fast 3D generation methods, achieving comparable performance to the state-of-the-art while reducing inference time by over 45x for reconstruction and 32x for generation. Code and models are available at https://github.com/Tencent/FlashVDM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16302v2-abstract-full').style.display = 'none'; document.getElementById('2503.16302v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13813">arXiv:2503.13813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13813">pdf</a>, <a href="https://arxiv.org/format/2503.13813">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Automatic MILP Model Construction for Multi-Robot Task Allocation and Scheduling Based on Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+M">Mingming Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhendong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jie Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zhengqi Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qihao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+L">Liang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13813v1-abstract-short" style="display: inline;"> With the accelerated development of Industry 4.0, intelligent manufacturing systems increasingly require efficient task allocation and scheduling in multi-robot systems. However, existing methods rely on domain expertise and face challenges in adapting to dynamic production constraints. Additionally, enterprises have high privacy requirements for production scheduling data, which prevents the use&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13813v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13813v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13813v1-abstract-full" style="display: none;"> With the accelerated development of Industry 4.0, intelligent manufacturing systems increasingly require efficient task allocation and scheduling in multi-robot systems. However, existing methods rely on domain expertise and face challenges in adapting to dynamic production constraints. Additionally, enterprises have high privacy requirements for production scheduling data, which prevents the use of cloud-based large language models (LLMs) for solution development. To address these challenges, there is an urgent need for an automated modeling solution that meets data privacy requirements. This study proposes a knowledge-augmented mixed integer linear programming (MILP) automated formulation framework, integrating local LLMs with domain-specific knowledge bases to generate executable code from natural language descriptions automatically. 
The framework employs a knowledge-guided DeepSeek-R1-Distill-Qwen-32B model to extract complex spatiotemporal constraints (82% average accuracy) and leverages a supervised fine-tuned Qwen2.5-Coder-7B-Instruct model for efficient MILP code generation (90% average accuracy). Experimental results demonstrate that the framework successfully achieves automatic modeling in the aircraft skin manufacturing case while ensuring data privacy and computational efficiency. This research provides a low-barrier and highly reliable technical path for modeling in complex industrial scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13813v1-abstract-full').style.display = 'none'; document.getElementById('2503.13813v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13447">arXiv:2503.13447</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13447">pdf</a>, <a href="https://arxiv.org/format/2503.13447">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MetaScale: Test-Time Scaling with Evolving Meta-Thoughts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J+Y">James Y. Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Poon%2C+H">Hoifung Poon</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Muhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13447v1-abstract-short" style="display: inline;"> One critical challenge for large language models (LLMs) for making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. 
To ad&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13447v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13447v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13447v1-abstract-full" style="display: none;"> One critical challenge for large language models (LLMs) for making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To address this limitation, we introduce METASCALE, a test-time scaling framework based on meta-thoughts -- adaptive thinking strategies tailored to each task. METASCALE initializes a pool of candidate meta-thoughts, then iteratively selects and evaluates them using a multi-armed bandit algorithm with upper confidence bound selection, guided by a reward model. To further enhance adaptability, a genetic algorithm evolves high-reward meta-thoughts, refining and extending the strategy pool over time. By dynamically proposing and optimizing meta-thoughts at inference time, METASCALE improves both accuracy and generalization across a wide range of tasks. Experimental results demonstrate that MetaScale consistently outperforms standard inference approaches, achieving an 11% performance gain in win rate on Arena-Hard for GPT-4o, surpassing o1-mini by 0.9% under style control. Notably, METASCALE scales more effectively with increasing sampling budgets and produces more structured, expert-level responses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13447v1-abstract-full').style.display = 'none'; document.getElementById('2503.13447v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13250">arXiv:2503.13250</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13250">pdf</a>, <a href="https://arxiv.org/format/2503.13250">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> MindEye-OmniAssist: A Gaze-Driven LLM-Enhanced Assistive Robot System for Implicit Intention Recognition and Task Execution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zejia Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinxing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weizhuang Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+W">Wei Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jian Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13250v1-abstract-short" style="display: inline;"> A promising and effective approach to human-robot interaction in assistive robotic systems is gaze-based control. However, current gaze-based assistive systems mainly help users with basic grasping actions, offering limited support. Moreover, the restricted intent recognition capability constrains the assistive system&#39;s ability to provide diverse assistance functions. In this paper, we propose an open implicit in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13250v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13250v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13250v1-abstract-full" style="display: none;"> A promising and effective approach to human-robot interaction in assistive robotic systems is gaze-based control. However, current gaze-based assistive systems mainly help users with basic grasping actions, offering limited support. Moreover, the restricted intent recognition capability constrains the assistive system&#39;s ability to provide diverse assistance functions. In this paper, we propose an open implicit intention recognition framework powered by Large Language Model (LLM) and Vision Foundation Model (VFM), which can process gaze input and recognize user intents that are not confined to predefined or specific scenarios. Furthermore, we implement a gaze-driven LLM-enhanced assistive robot system (MindEye-OmniAssist) that recognizes the user&#39;s intentions through gaze and assists in completing tasks. To achieve this, the system utilizes an open-vocabulary object detector, an intention recognition network, and an LLM to infer the user&#39;s full intentions. By integrating eye movement feedback and LLM, it generates action sequences to assist the user in completing tasks.
Real-world experiments have been conducted for assistive tasks, and the system achieved an overall success rate of 41/55 across various undefined tasks. Preliminary results show that the proposed method holds the potential to provide a more user-friendly human-computer interaction interface and significantly enhance the versatility and effectiveness of assistive systems by supporting more complex and diverse tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13250v1-abstract-full').style.display = 'none'; document.getElementById('2503.13250v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13214">arXiv:2503.13214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13214">pdf</a>, <a href="https://arxiv.org/format/2503.13214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A General Adaptive Dual-level Weighting Mechanism for Remote Sensing Pansharpening </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haorui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jiaxuan Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+S">Siran Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+L">Liangjian Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13214v3-abstract-short" style="display: inline;"> Currently, deep learning-based methods for remote sensing pansharpening have advanced rapidly. However, many existing methods struggle to fully leverage feature heterogeneity and redundancy, thereby limiting their effectiveness. We use the covariance matrix to model the feature heterogeneity and redundancy and propose Correlation-Aware Covariance Weighting (CACW) to adjust them. CACW captures thes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13214v3-abstract-full').style.display = 'inline'; document.getElementById('2503.13214v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13214v3-abstract-full" style="display: none;"> Currently, deep learning-based methods for remote sensing pansharpening have advanced rapidly. However, many existing methods struggle to fully leverage feature heterogeneity and redundancy, thereby limiting their effectiveness. We use the covariance matrix to model the feature heterogeneity and redundancy and propose Correlation-Aware Covariance Weighting (CACW) to adjust them. CACW captures these correlations through the covariance matrix, which is then processed by a nonlinear function to generate weights for adjustment.
Building upon CACW, we introduce a general adaptive dual-level weighting mechanism (ADWM) to address these challenges from two key perspectives, enhancing a wide range of existing deep-learning methods. First, Intra-Feature Weighting (IFW) evaluates correlations among channels within each feature to reduce redundancy and enhance unique information. Second, Cross-Feature Weighting (CFW) adjusts contributions across layers based on inter-layer correlations, refining the final output. Extensive experiments demonstrate the superior performance of ADWM compared to recent state-of-the-art (SOTA) methods. Furthermore, we validate the effectiveness of our approach through generality experiments, redundancy visualization, comparison experiments, key variables and complexity analysis, and ablation studies. Our code is available at https://github.com/Jie-1203/ADWM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13214v3-abstract-full').style.display = 'none'; document.getElementById('2503.13214v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted at the CVPR Conference on Computer Vision and Pattern Recognition 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13131">arXiv:2503.13131</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13131">pdf</a>, <a href="https://arxiv.org/format/2503.13131">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Patient-specific radiomic feature selection with reconstructed healthy persona of knee MR images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yaxi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+S">Simin Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Ivanova%2C+A">Aleksandra Ivanova</a>, <a href="/search/cs?searchtype=author&amp;query=Saeed%2C+S+U">Shaheer U. Saeed</a>, <a href="/search/cs?searchtype=author&amp;query=Hargunani%2C+R">Rikin Hargunani</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chaozong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yipeng Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13131v1-abstract-short" style="display: inline;"> Classical radiomic features have been designed to describe image appearance and intensity patterns. These features are directly interpretable and readily understood by radiologists. 
Compared with end-to-end deep learning (DL) models, lower dimensional parametric models that use such radiomic features offer enhanced interpretability but lower comparative performance in clinical tasks. In this study&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13131v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13131v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13131v1-abstract-full" style="display: none;"> Classical radiomic features have been designed to describe image appearance and intensity patterns. These features are directly interpretable and readily understood by radiologists. Compared with end-to-end deep learning (DL) models, lower dimensional parametric models that use such radiomic features offer enhanced interpretability but lower comparative performance in clinical tasks. In this study, we propose an approach in which the performance of a standard logistic regression model is substantially improved by learning to select radiomic features for individual patients, from a pool of candidate features. This approach has the potential to maintain the interpretability of such approaches while offering comparable performance to DL. We also propose to expand the feature pool by generating a patient-specific healthy persona via mask-inpainting using a denoising diffusion model trained on healthy subjects. Such a pathology-free baseline feature set allows further opportunity in novel feature discovery and improved condition classification. We demonstrate our method on multiple clinical tasks of classifying general abnormalities, anterior cruciate ligament tears, and meniscus tears. Experimental results demonstrate that our approach achieved comparable or even superior performance to state-of-the-art DL approaches while offering added interpretability by using radiomic features extracted from images and supplemented by generating healthy personas. Example clinical cases are discussed in-depth to demonstrate the interpretability-enabled utilities such as human-explainable feature discovery and patient-specific location/view selection. These findings highlight the potential of combining subject-specific feature selection with generative models in augmenting radiomic analysis for more interpretable decision-making. The code is available at: https://github.com/YaxiiC/RadiomicsPersona.git <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13131v1-abstract-full').style.display = 'none'; document.getElementById('2503.13131v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12944">arXiv:2503.12944</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12944">pdf</a>, <a href="https://arxiv.org/format/2503.12944">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GIFT: Generated Indoor video frames for Texture-less point tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jianzheng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+X">Xianyu Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziling Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinyu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+F">Feng Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12944v1-abstract-short" style="display: inline;"> Point tracking is becoming a powerful solver for motion estimation and video editing. Compared to classical feature matching, point tracking methods have the key advantage of robustly tracking points under complex camera motion trajectories and over extended periods. However, despite certain improvements in methodologies, current point tracking methods still struggle to track any position in video&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12944v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12944v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12944v1-abstract-full" style="display: none;"> Point tracking is becoming a powerful solver for motion estimation and video editing. Compared to classical feature matching, point tracking methods have the key advantage of robustly tracking points under complex camera motion trajectories and over extended periods. However, despite certain improvements in methodologies, current point tracking methods still struggle to track any position in video frames, especially in areas that are texture-less or weakly textured. In this work, we first introduce metrics for evaluating the texture intensity of a 3D object. Using these metrics, we classify the 3D models in ShapeNet into three levels of texture intensity and create GIFT, a challenging synthetic benchmark comprising 1800 indoor video sequences with rich annotations. Unlike existing datasets that assign ground truth points arbitrarily, GIFT precisely anchors ground truth on classified target objects, ensuring that each video corresponds to a specific texture intensity level. Furthermore, we comprehensively evaluate current methods on GIFT to assess their performance across different texture intensity levels and analyze the impact of texture on point tracking. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12944v1-abstract-full').style.display = 'none'; document.getElementById('2503.12944v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12937">arXiv:2503.12937</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12937">pdf</a>, <a href="https://arxiv.org/format/2503.12937">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> R1-VL: Learning to Reason with Multimodal Large Language Models via Step-wise Group Relative Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiaxing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+H">Huanjin Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shunyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xikun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12937v1-abstract-short" style="display: inline;"> Recent studies generally enhance MLLMs&#39; reasoning capabilities via supervised fine-tuning on high-quality chain-of-thought reasoning data, which often leads models to merely imitate successful reasoning paths without understanding what the wrong reasoning paths are. In this work, we aim to enhance the MLLMs&#39; reasoning ability beyond passively imitating positive reasoning paths. To this end, we des&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12937v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12937v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12937v1-abstract-full" style="display: none;"> Recent studies generally enhance MLLMs&#39; reasoning capabilities via supervised fine-tuning on high-quality chain-of-thought reasoning data, which often leads models to merely imitate successful reasoning paths without understanding what the wrong reasoning paths are. In this work, we aim to enhance the MLLMs&#39; reasoning ability beyond passively imitating positive reasoning paths. 
To this end, we design Step-wise Group Relative Policy Optimization (StepGRPO), a new online reinforcement learning framework that enables MLLMs to self-improve reasoning ability via simple, effective and dense step-wise rewarding. Specifically, StepGRPO introduces two novel rule-based reasoning rewards: Step-wise Reasoning Accuracy Reward (StepRAR) and Step-wise Reasoning Validity Reward (StepRVR). StepRAR rewards the reasoning paths that contain necessary intermediate reasoning steps via a soft key-step matching technique, while StepRVR rewards reasoning paths that follow a well-structured and logically consistent reasoning process through a reasoning completeness and logic evaluation strategy. With the proposed StepGRPO, we introduce R1-VL, a series of MLLMs with outstanding capabilities in step-by-step reasoning. Extensive experiments over 8 benchmarks demonstrate the superiority of our methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12937v1-abstract-full').style.display = 'none'; document.getElementById('2503.12937v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12838">arXiv:2503.12838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12838">pdf</a>, <a href="https://arxiv.org/format/2503.12838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DreamLayer: Simultaneous Multi-Layer Generation via Diffusion Mode </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junjia Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+P">Pengxiang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Jinhang Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yitong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xinglong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guanbin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12838v1-abstract-short" style="display: inline;"> Text-driven image generation using diffusion models has recently gained significant attention. To enable more flexible image manipulation and editing, recent research has expanded from single image generation to transparent layer generation and multi-layer compositions. 
However, existing approaches often fail to provide a thorough exploration of multi-layer structures, leading to inconsistent inte&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12838v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12838v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12838v1-abstract-full" style="display: none;"> Text-driven image generation using diffusion models has recently gained significant attention. To enable more flexible image manipulation and editing, recent research has expanded from single image generation to transparent layer generation and multi-layer compositions. However, existing approaches often fail to provide a thorough exploration of multi-layer structures, leading to inconsistent inter-layer interactions, such as occlusion relationships, spatial layout, and shadowing. In this paper, we introduce DreamLayer, a novel framework that enables coherent text-driven generation of multiple image layers, by explicitly modeling the relationship between transparent foreground and background layers. DreamLayer incorporates three key components, i.e., Context-Aware Cross-Attention (CACA) for global-local information exchange, Layer-Shared Self-Attention (LSSA) for establishing robust inter-layer connections, and Information Retained Harmonization (IRH) for refining fusion details at the latent level. By leveraging a coherent full-image context, DreamLayer builds inter-layer connections through attention mechanisms and applies a harmonization step to achieve seamless layer fusion. To facilitate research in multi-layer generation, we construct a high-quality, diverse multi-layer dataset including 400k samples. Extensive experiments and user studies demonstrate that DreamLayer generates more coherent and well-aligned layers, with broad applicability, including latent-space image editing and image-to-layer decomposition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12838v1-abstract-full').style.display = 'none'; document.getElementById('2503.12838v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under submission</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12784">arXiv:2503.12784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12784">pdf</a>, <a href="https://arxiv.org/format/2503.12784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> Causal Feature Learning in the Social Sciences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jingzhou Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiuyao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Tolbert%2C+A+W">Alexander Williams Tolbert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12784v1-abstract-short" style="display: inline;"> Variable selection poses a significant challenge in causal modeling, particularly within the social sciences, where constructs often rely on inter-related factors such as age, socioeconomic status, gender, and race. Indeed, it has been argued that such attributes must be modeled as macro-level abstractions of lower-level manipulable features, in order to preserve the modularity assumption essentia&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12784v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12784v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12784v1-abstract-full" style="display: none;"> Variable selection poses a significant challenge in causal modeling, particularly within the social sciences, where constructs often rely on inter-related factors such as age, socioeconomic status, gender, and race. Indeed, it has been argued that such attributes must be modeled as macro-level abstractions of lower-level manipulable features, in order to preserve the modularity assumption essential to causal inference. This paper accordingly extends the theoretical framework of Causal Feature Learning (CFL). Empirically, we apply the CFL algorithm to diverse social science datasets, evaluating how CFL-derived macrostates compare with traditional microstates in downstream modeling tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12784v1-abstract-full').style.display = 'none'; document.getElementById('2503.12784v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12764">arXiv:2503.12764</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12764">pdf</a>, <a href="https://arxiv.org/format/2503.12764">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Decouple to Reconstruct: High Quality UHD Restoration via Active Feature Disentanglement and Reversible Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yidi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yuxin Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenlong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xueyang Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+Z">Zheng-jun Zha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12764v1-abstract-short" style="display: inline;"> Ultra-high-definition (UHD) image restoration often faces computational bottlenecks and information loss due to its extremely high resolution. Existing studies based on Variational Autoencoders (VAE) improve efficiency by transferring the image restoration process from pixel space to latent space. However, degraded components are inherently coupled with background elements in degraded images, both&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12764v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12764v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12764v1-abstract-full" style="display: none;"> Ultra-high-definition (UHD) image restoration often faces computational bottlenecks and information loss due to its extremely high resolution. Existing studies based on Variational Autoencoders (VAE) improve efficiency by transferring the image restoration process from pixel space to latent space. However, degraded components are inherently coupled with background elements in degraded images, both information loss during compression and information gain during compensation remain uncontrollable. These lead to restored images often exhibiting image detail loss and incomplete degradation removal. To address this issue, we propose a Controlled Differential Disentangled VAE, which utilizes Hierarchical Contrastive Disentanglement Learning and an Orthogonal Gated Projection Module to guide the VAE to actively discard easily recoverable background information while encoding more difficult-to-recover degraded information into the latent space. Additionally, we design a Complex Invertible Multiscale Fusion Network to handle background features, ensuring their consistency, and utilize a latent space restoration network to transform the degraded latent features, leading to more accurate restoration results. 
Extensive experimental results demonstrate that our method effectively alleviates the information loss problem in VAE models while ensuring computational efficiency, significantly improving the quality of UHD image restoration, and achieves state-of-the-art results in six UHD restoration tasks with only 1M parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12764v1-abstract-full').style.display = 'none'; document.getElementById('2503.12764v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12759">arXiv:2503.12759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12759">pdf</a>, <a href="https://arxiv.org/format/2503.12759">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RAG-RL: Advancing Retrieval-Augmented Generation via RL and Curriculum Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jerry Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Madala%2C+S">Siddarth Madala</a>, <a href="/search/cs?searchtype=author&amp;query=Sidhu%2C+R">Risham Sidhu</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+C">Cheng Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Hockenmaier%2C+J">Julia Hockenmaier</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12759v1-abstract-short" style="display: inline;"> Recent research highlights the challenges retrieval models face in retrieving useful contexts and the limitations of generation models in effectively utilizing those contexts in retrieval-augmented generation (RAG) settings. To address these challenges, we introduce RAG-RL, the first reasoning language model (RLM) specifically trained for RAG. RAG-RL demonstrates that stronger answer generation mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12759v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12759v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12759v1-abstract-full" style="display: none;"> Recent research highlights the challenges retrieval models face in retrieving useful contexts and the limitations of generation models in effectively utilizing those contexts in retrieval-augmented generation (RAG) settings. To address these challenges, we introduce RAG-RL, the first reasoning language model (RLM) specifically trained for RAG. RAG-RL demonstrates that stronger answer generation models can identify relevant contexts within larger sets of retrieved information -- thereby alleviating the burden on retrievers -- while also being able to utilize those contexts more effectively. 
Moreover, we show that curriculum design in the reinforcement learning (RL) post-training process is a powerful approach to enhancing model performance. We benchmark our method on two open-domain question-answering datasets and achieve state-of-the-art results, surpassing previous SOTA generative reader models. In addition, we offer empirical insights into various curriculum learning strategies, providing a deeper understanding of their impact on model performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12759v1-abstract-full').style.display = 'none'; document.getElementById('2503.12759v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 Pages, 3 Figures, Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12622">arXiv:2503.12622</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12622">pdf</a>, <a href="https://arxiv.org/format/2503.12622">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Cell Sorting with Scalable In Situ FPGA-Accelerated Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Islam%2C+K">Khayrul Islam</a>, <a href="/search/cs?searchtype=author&amp;query=Forelli%2C+R+F">Ryan F. Forelli</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jianzhong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Bhadane%2C+D">Deven Bhadane</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jian Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Agar%2C+J+C">Joshua C. Agar</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+N">Nhan Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Ogrenci%2C+S">Seda Ogrenci</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yaling Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12622v1-abstract-short" style="display: inline;"> Precise cell classification is essential in biomedical diagnostics and therapeutic monitoring, particularly for identifying diverse cell types involved in various diseases. Traditional cell classification methods such as flow cytometry depend on molecular labeling which is often costly, time-intensive, and can alter cell integrity. 
To overcome these limitations, we present a label-free machine lea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12622v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12622v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12622v1-abstract-full" style="display: none;"> Precise cell classification is essential in biomedical diagnostics and therapeutic monitoring, particularly for identifying diverse cell types involved in various diseases. Traditional cell classification methods such as flow cytometry depend on molecular labeling which is often costly, time-intensive, and can alter cell integrity. To overcome these limitations, we present a label-free machine learning framework for cell classification, designed for real-time sorting applications using bright-field microscopy images. This approach leverages a teacher-student model architecture enhanced by knowledge distillation, achieving high efficiency and scalability across different cell types. Demonstrated through a use case of classifying lymphocyte subsets, our framework accurately classifies T4, T8, and B cell types with a dataset of 80,000 preprocessed images, accessible via an open-source Python package for easy adaptation. Our teacher model attained 98% accuracy in differentiating T4 cells from B cells and 93% accuracy in zero-shot classification between T8 and B cells. Remarkably, our student model operates with only 0.02% of the teacher model&#39;s parameters, enabling field-programmable gate array (FPGA) deployment. Our FPGA-accelerated student model achieves an ultra-low inference latency of just 14.5 $\mu$s and a complete cell detection-to-sorting trigger time of 24.7 $\mu$s, delivering 12x and 40x improvements over the previous state-of-the-art real-time cell analysis algorithm in inference and total latency, respectively, while preserving accuracy comparable to the teacher model. This framework provides a scalable, cost-effective solution for lymphocyte classification, as well as a new SOTA real-time cell sorting implementation for rapid identification of subsets using in situ deep learning on off-the-shelf computing hardware. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12622v1-abstract-full').style.display = 'none'; document.getElementById('2503.12622v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12383">arXiv:2503.12383</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12383">pdf</a>, <a href="https://arxiv.org/format/2503.12383">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VRsketch2Gaussian: 3D VR Sketch Guided 3D Object Generation with Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+S">Songen Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+H">Haoxuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Binjie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Sanyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Haiyong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+F">Feng Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12383v1-abstract-short" style="display: inline;"> We propose VRSketch2Gaussian, a first VR sketch-guided, multi-modal, native 3D object generation framework that incorporates a 3D Gaussian Splatting representation. As part of our work, we introduce VRSS, the first large-scale paired dataset containing VR sketches, text, images, and 3DGS, bridging the gap in multi-modal VR sketch-based generation. Our approach features the following key innovation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12383v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12383v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12383v1-abstract-full" style="display: none;"> We propose VRSketch2Gaussian, a first VR sketch-guided, multi-modal, native 3D object generation framework that incorporates a 3D Gaussian Splatting representation. As part of our work, we introduce VRSS, the first large-scale paired dataset containing VR sketches, text, images, and 3DGS, bridging the gap in multi-modal VR sketch-based generation. Our approach features the following key innovations: 1) Sketch-CLIP feature alignment. We propose a two-stage alignment strategy that bridges the domain gap between sparse VR sketch embeddings and rich CLIP embeddings, facilitating both VR sketch-based retrieval and generation tasks. 2) Fine-Grained multi-modal conditioning. We disentangle the 3D generation process by using explicit VR sketches for geometric conditioning and text descriptions for appearance control. To facilitate this, we propose a generalizable VR sketch encoder that effectively aligns different modalities. 3) Efficient and high-fidelity 3D native generation. Our method leverages a 3D-native generation approach that enables fast and texture-rich 3D object synthesis. Experiments conducted on our VRSS dataset demonstrate that our method achieves high-quality, multi-modal VR sketch-based 3D generation. 
We believe our VRSS dataset and VRsketch2Gaussian method will be beneficial for the 3D generation community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12383v1-abstract-full').style.display = 'none'; document.getElementById('2503.12383v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11049">arXiv:2503.11049</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.11049">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Fish Mouth Inspired Origami Gripper for Robust Multi-Type Underwater Grasping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Honghao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junda Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+I">Ian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+B">Boyuan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xin Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yunhui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jianshu Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11049v2-abstract-short" style="display: inline;"> Robotic grasping and manipulation in underwater environments present unique challenges for robotic hands traditionally used on land. These challenges stem from dynamic water conditions, a wide range of object properties from soft to stiff, irregular object shapes, and varying surface frictions. One common approach involves developing finger-based hands with embedded compliance using underactuation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11049v2-abstract-full').style.display = 'inline'; document.getElementById('2503.11049v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11049v2-abstract-full" style="display: none;"> Robotic grasping and manipulation in underwater environments present unique challenges for robotic hands traditionally used on land. These challenges stem from dynamic water conditions, a wide range of object properties from soft to stiff, irregular object shapes, and varying surface frictions. One common approach involves developing finger-based hands with embedded compliance using underactuation and soft actuators. This study introduces an effective alternative solution that does not rely on finger-based hand designs. We present a fish mouth inspired origami gripper that utilizes a single degree of freedom to perform a variety of robust grasping tasks underwater. The innovative structure transforms a simple uniaxial pulling motion into a grasping action based on the Yoshimura crease pattern folding. 
The origami gripper offers distinct advantages, including scalable and optimizable design, grasping compliance, and robustness, with four grasping types: pinch, power grasp, simultaneous grasping of multiple objects, and scooping from the seabed. In this work, we detail the design, modeling, fabrication, and validation of a specialized underwater gripper capable of handling various marine creatures, including jellyfish, crabs, and abalone. By leveraging an origami and bio-inspired approach, the presented gripper demonstrates promising potential for robotic grasping and manipulation in underwater environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11049v2-abstract-full').style.display = 'none'; document.getElementById('2503.11049v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10894">arXiv:2503.10894</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10894">pdf</a>, <a href="https://arxiv.org/format/2503.10894">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HyperDAS: Towards Automating Mechanistic Interpretability with Hypernetworks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jiuding Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Baskaran%2C+S">Sidharth Baskaran</a>, <a href="/search/cs?searchtype=author&amp;query=D%27Oosterlinck%2C+K">Karel D&#39;Oosterlinck</a>, <a href="/search/cs?searchtype=author&amp;query=Potts%2C+C">Christopher Potts</a>, <a href="/search/cs?searchtype=author&amp;query=Sklar%2C+M">Michael Sklar</a>, <a href="/search/cs?searchtype=author&amp;query=Geiger%2C+A">Atticus Geiger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10894v1-abstract-short" style="display: inline;"> Mechanistic interpretability has made great strides in identifying neural network features (e.g., directions in hidden activation space) that mediate concepts(e.g., the birth year of a person) and enable predictable manipulation. 
Distributed alignment search (DAS) leverages supervision from counterfactual data to learn concept features within hidden states, but DAS assumes we can afford to conduct&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10894v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10894v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10894v1-abstract-full" style="display: none;"> Mechanistic interpretability has made great strides in identifying neural network features (e.g., directions in hidden activation space) that mediate concepts (e.g., the birth year of a person) and enable predictable manipulation. Distributed alignment search (DAS) leverages supervision from counterfactual data to learn concept features within hidden states, but DAS assumes we can afford to conduct a brute-force search over potential feature locations. To address this, we present HyperDAS, a transformer-based hypernetwork architecture that (1) automatically locates the token-positions of the residual stream that a concept is realized in and (2) constructs features of those residual stream vectors for the concept. In experiments with Llama3-8B, HyperDAS achieves state-of-the-art performance on the RAVEL benchmark for disentangling concepts in hidden states. In addition, we review the design decisions we made to mitigate the concern that HyperDAS (like all powerful interpretability methods) might inject new information into the target model rather than faithfully interpreting it. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10894v1-abstract-full').style.display = 'none'; document.getElementById('2503.10894v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10737">arXiv:2503.10737</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10737">pdf</a>, <a href="https://arxiv.org/format/2503.10737">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Commenting Higher-level Code Unit: Full Code, Reduced Code, or Hierarchical Code Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Weisong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yiran Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhihui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+C">Chunrong Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yonglong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yebo Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiangping Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xingya Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Zhi Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10737v1-abstract-short" style="display: inline;"> Commenting code is a crucial activity in software development, as it aids in facilitating future maintenance and updates. To enhance the efficiency of writing comments and reduce developers&#39; workload, researchers has proposed various automated code summarization (ACS) techniques to automatically generate comments/summaries for given code units. However, these ACS techniques primarily focus on gene&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10737v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10737v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10737v1-abstract-full" style="display: none;"> Commenting code is a crucial activity in software development, as it aids in facilitating future maintenance and updates. To enhance the efficiency of writing comments and reduce developers&#39; workload, researchers has proposed various automated code summarization (ACS) techniques to automatically generate comments/summaries for given code units. However, these ACS techniques primarily focus on generating summaries for code units at the method level. There is a significant lack of research on summarizing higher-level code units, such as file-level and module-level code units, despite the fact that summaries of these higher-level code units are highly useful for quickly gaining a macro-level understanding of software components and architecture. 
To fill this gap, in this paper, we conduct a systematic study on how to use LLMs for commenting higher-level code units, including file level and module level. These higher-level units are significantly larger than method-level ones, which poses challenges in handling long code inputs within LLM constraints and maintaining efficiency. To address these issues, we explore various summarization strategies for ACS of higher-level code units, which can be divided into three types: full code summarization, reduced code summarization, and hierarchical code summarization. The experimental results suggest that for summarizing file-level code units, using the full code is the most effective approach, with reduced code serving as a cost-efficient alternative. However, for summarizing module-level code units, hierarchical code summarization becomes the most promising strategy. In addition, inspired by the research on method-level ACS, we also investigate using the LLM as an evaluator to evaluate the quality of summaries of higher-level code units. The experimental results demonstrate that the LLM&#39;s evaluation results strongly correlate with human evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10737v1-abstract-full').style.display = 'none'; document.getElementById('2503.10737v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68-04 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.3; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10471">arXiv:2503.10471</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10471">pdf</a>, <a href="https://arxiv.org/format/2503.10471">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Siamese Foundation Models for Crystal Structure Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Liming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wenbing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+R">Rui Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jianxing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Liwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yipeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+F">Fuchun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yuxiang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Jirong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark 
mathjax" id="2503.10471v1-abstract-short" style="display: inline;"> Crystal Structure Prediction (CSP), which aims to generate stable crystal structures from compositions, represents a critical pathway for discovering novel materials. While structure prediction tasks in other domains, such as proteins, have seen remarkable progress, CSP remains a relatively underexplored area due to the more complex geometries inherent in crystal structures. In this paper, we prop&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10471v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10471v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10471v1-abstract-full" style="display: none;"> Crystal Structure Prediction (CSP), which aims to generate stable crystal structures from compositions, represents a critical pathway for discovering novel materials. While structure prediction tasks in other domains, such as proteins, have seen remarkable progress, CSP remains a relatively underexplored area due to the more complex geometries inherent in crystal structures. In this paper, we propose Siamese foundation models specifically designed to address CSP. Our pretrain-finetune framework, named DAO, comprises two complementary foundation models: DAO-G for structure generation and DAO-P for energy prediction. Experiments on CSP benchmarks (MP-20 and MPTS-52) demonstrate that our DAO-G significantly surpasses state-of-the-art (SOTA) methods across all metrics. Extensive ablation studies further confirm that DAO-G excels in generating diverse polymorphic structures, and the dataset relaxation and energy guidance provided by DAO-P are essential for enhancing DAO-G&#39;s performance. When applied to three real-world superconductors ($\text{CsV}_3\text{Sb}_5$, $ \text{Zr}_{16}\text{Rh}_8\text{O}_4$ and $\text{Zr}_{16}\text{Pd}_8\text{O}_4$) that are known to be challenging to analyze, our foundation models achieve accurate critical temperature predictions and structure generations. For instance, on $\text{CsV}_3\text{Sb}_5$, DAO-G generates a structure close to the experimental one with an RMSE of 0.0085; DAO-P predicts the $T_c$ value with high accuracy (2.26 K vs. the ground-truth value of 2.30 K). In contrast, conventional DFT calculators like Quantum Espresso only successfully derive the structure of the first superconductor within an acceptable time, while the RMSE is nearly 8 times larger, and the computation speed is more than 1000 times slower. These compelling results collectively highlight the potential of our approach for advancing materials science research and development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10471v1-abstract-full').style.display = 'none'; document.getElementById('2503.10471v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10118">arXiv:2503.10118</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.10118">pdf</a>, <a href="https://arxiv.org/format/2503.10118">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Real-Sim-Real (RSR) Loop Framework for Generalizable Robotic Policy Transfer with Differentiable Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+L">Lu Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yuxuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jinhao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenhao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yufei Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Z">Zike Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+W">Weibin Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+G">Guyue Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10118v2-abstract-short" style="display: inline;"> The sim-to-real gap remains a critical challenge in robotics, hindering the deployment of algorithms trained in simulation to real-world systems. This paper introduces a novel Real-Sim-Real (RSR) loop framework leveraging differentiable simulation to address this gap by iteratively refining simulation parameters, aligning them with real-world conditions, and enabling robust and efficient policy tr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10118v2-abstract-full').style.display = 'inline'; document.getElementById('2503.10118v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10118v2-abstract-full" style="display: none;"> The sim-to-real gap remains a critical challenge in robotics, hindering the deployment of algorithms trained in simulation to real-world systems. This paper introduces a novel Real-Sim-Real (RSR) loop framework leveraging differentiable simulation to address this gap by iteratively refining simulation parameters, aligning them with real-world conditions, and enabling robust and efficient policy transfer. A key contribution of our work is the design of an informative cost function that encourages the collection of diverse and representative real-world data, minimizing bias and maximizing the utility of each data point for simulation refinement. This cost function integrates seamlessly into existing reinforcement learning algorithms (e.g., PPO, SAC) and ensures a balanced exploration of critical regions in the real domain. Furthermore, our approach is implemented on the versatile Mujoco MJX platform, and our framework is compatible with a wide range of robotic systems. 
Experimental results on several robotic manipulation tasks demonstrate that our method significantly reduces the sim-to-real gap, achieving high task performance and generalizability across diverse scenarios of both explicit and implicit environmental uncertainties. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10118v2-abstract-full').style.display = 'none'; document.getElementById('2503.10118v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09474">arXiv:2503.09474</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.09474">pdf</a>, <a href="https://arxiv.org/format/2503.09474">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SurgicalVLM-Agent: Towards an Interactive AI Co-Pilot for Pituitary Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiayuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+R">Runlong He</a>, <a href="/search/cs?searchtype=author&amp;query=Khan%2C+D+Z">Danyal Z. Khan</a>, <a href="/search/cs?searchtype=author&amp;query=Mazomenos%2C+E">Evangelos Mazomenos</a>, <a href="/search/cs?searchtype=author&amp;query=Stoyanov%2C+D">Danail Stoyanov</a>, <a href="/search/cs?searchtype=author&amp;query=Marcus%2C+H+J">Hani J. Marcus</a>, <a href="/search/cs?searchtype=author&amp;query=Clarkson%2C+M+J">Matthew J. Clarkson</a>, <a href="/search/cs?searchtype=author&amp;query=Islam%2C+M">Mobarakol Islam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09474v1-abstract-short" style="display: inline;"> Image-guided surgery demands adaptive, real-time decision support, yet static AI models struggle with structured task planning and providing interactive guidance. Large vision-language models (VLMs) offer a promising solution by enabling dynamic task planning and predictive decision support. We introduce SurgicalVLM-Agent, an AI co-pilot for image-guided pituitary surgery, capable of conversation,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09474v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09474v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09474v1-abstract-full" style="display: none;"> Image-guided surgery demands adaptive, real-time decision support, yet static AI models struggle with structured task planning and providing interactive guidance. Large vision-language models (VLMs) offer a promising solution by enabling dynamic task planning and predictive decision support. We introduce SurgicalVLM-Agent, an AI co-pilot for image-guided pituitary surgery, capable of conversation, planning, and task execution. 
The agent dynamically processes surgeon queries and plans the tasks such as MRI tumor segmentation, endoscope anatomy segmentation, overlaying preoperative imaging with intraoperative views, instrument tracking, and surgical visual question answering (VQA). To enable structured task planning, we develop the PitAgent dataset, a surgical context-aware dataset covering segmentation, overlaying, instrument localization, tool tracking, tool-tissue interactions, phase identification, and surgical activity recognition. Additionally, we propose FFT-GaLore, a fast Fourier transform (FFT)-based gradient projection technique for efficient low-rank adaptation, optimizing fine-tuning for LLaMA 3.2 in surgical environments. We validate SurgicalVLM-Agent by assessing task planning and prompt generation on our PitAgent dataset and evaluating zero-shot VQA using a public pituitary dataset. Results demonstrate state-of-the-art performance in task planning and query interpretation, with highly semantically meaningful VQA responses, advancing AI-driven surgical assistance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09474v1-abstract-full').style.display = 'none'; document.getElementById('2503.09474v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09382">arXiv:2503.09382</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.09382">pdf</a>, <a href="https://arxiv.org/format/2503.09382">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Next-Generation Recommender Systems: A Benchmark for Personalized Recommendation Assistant with LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiani Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shijie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+L">Liang-bo Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+W">Wenqi Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuaiqiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Dawei Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09382v1-abstract-short" style="display: inline;"> Recommender systems (RecSys) are widely used across various modern digital platforms and have garnered significant attention. Traditional recommender systems usually focus only on fixed and simple recommendation scenarios, making it difficult to generalize to new and unseen recommendation tasks in an interactive paradigm. 
Recently, the advancement of large language models (LLMs) has revolutionized&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09382v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09382v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09382v1-abstract-full" style="display: none;"> Recommender systems (RecSys) are widely used across various modern digital platforms and have garnered significant attention. Traditional recommender systems usually focus only on fixed and simple recommendation scenarios, making it difficult to generalize to new and unseen recommendation tasks in an interactive paradigm. Recently, the advancement of large language models (LLMs) has revolutionized the foundational architecture of RecSys, driving their evolution into more intelligent and interactive personalized recommendation assistants. However, most existing studies rely on fixed task-specific prompt templates to generate recommendations and evaluate the performance of personalized assistants, which limits the comprehensive assessment of their capabilities. This is because commonly used datasets lack high-quality textual user queries that reflect real-world recommendation scenarios, making them unsuitable for evaluating LLM-based personalized recommendation assistants. To address this gap, we introduce RecBench+, a new dataset benchmark designed to assess LLMs&#39; ability to handle intricate user recommendation needs in the era of LLMs. RecBench+ encompasses a diverse set of queries that span both hard conditions and soft preferences, with varying difficulty levels. We evaluated commonly used LLMs on RecBench+ and uncovered the following findings: 1) LLMs demonstrate preliminary abilities to act as recommendation assistants, 2) LLMs are better at handling queries with explicitly stated conditions, while facing challenges with queries that require reasoning or contain misleading information. Our dataset has been released at https://github.com/jiani-huang/RecBench.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09382v1-abstract-full').style.display = 'none'; document.getElementById('2503.09382v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09309">arXiv:2503.09309</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.09309">pdf</a>, <a href="https://arxiv.org/ps/2503.09309">ps</a>, <a href="https://arxiv.org/format/2503.09309">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Steering No-Regret Agents in MFGs under Model Uncertainty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Widmer%2C+L">Leo Widmer</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+N">Niao He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09309v1-abstract-short" style="display: inline;"> Incentive design is a popular framework for guiding agents&#39; learning dynamics towards desired outcomes by providing additional payments beyond intrinsic rewards. However, most existing works focus on a finite, small set of agents or assume complete knowledge of the game, limiting their applicability to real-world scenarios involving large populations and model uncertainty. To address this gap, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09309v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09309v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09309v1-abstract-full" style="display: none;"> Incentive design is a popular framework for guiding agents&#39; learning dynamics towards desired outcomes by providing additional payments beyond intrinsic rewards. However, most existing works focus on a finite, small set of agents or assume complete knowledge of the game, limiting their applicability to real-world scenarios involving large populations and model uncertainty. To address this gap, we study the design of steering rewards in Mean-Field Games (MFGs) with density-independent transitions, where both the transition dynamics and intrinsic reward functions are unknown. This setting presents non-trivial challenges, as the mediator must incentivize the agents to explore for its model learning under uncertainty, while simultaneously steer them to converge to desired behaviors without incurring excessive incentive payments. Assuming agents exhibit no(-adaptive) regret behaviors, we contribute novel optimistic exploration algorithms. Theoretically, we establish sub-linear regret guarantees for the cumulative gaps between the agents&#39; behaviors and the desired ones. In terms of the steering cost, we demonstrate that our total incentive payments incur only sub-linear excess, competing with a baseline steering strategy that stabilizes the target policy as an equilibrium. 
Our work presents an effective framework for steering agents behaviors in large-population systems under uncertainty. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09309v1-abstract-full').style.display = 'none'; document.getElementById('2503.09309v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2025; 34 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08689">arXiv:2503.08689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08689">pdf</a>, <a href="https://arxiv.org/format/2503.08689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> QuoTA: Query-oriented Token Assignment via CoT Query Decouple for Long Video Comprehension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yongdong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiawu Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Weizhong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+S">Shukang Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Haojia Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+C">Chaoyou Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jinfa Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+J">Jiayi Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+R">Rongrong Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08689v1-abstract-short" style="display: inline;"> Recent advances in long video understanding typically mitigate visual redundancy through visual token pruning based on attention distribution. However, while existing methods employ post-hoc low-response token pruning in decoder layers, they overlook the input-level semantic correlation between visual tokens and instructions (query). In this paper, we propose QuoTA, an ante-hoc training-free modul&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08689v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08689v1-abstract-full" style="display: none;"> Recent advances in long video understanding typically mitigate visual redundancy through visual token pruning based on attention distribution. 
However, while existing methods employ post-hoc low-response token pruning in decoder layers, they overlook the input-level semantic correlation between visual tokens and instructions (query). In this paper, we propose QuoTA, an ante-hoc, training-free module that extends existing large video-language models (LVLMs) for visual token assignment based on query-oriented frame-level importance assessment. The query-oriented token selection is crucial as it aligns visual processing with task-specific requirements, optimizing token budget utilization while preserving semantically relevant content. Specifically, (i) QuoTA strategically allocates frame-level importance scores based on query relevance, enabling one-time visual token assignment before cross-modal interactions in decoder layers, (ii) we decouple the query through Chain-of-Thoughts reasoning to facilitate more precise LVLM-based frame importance scoring, and (iii) QuoTA offers plug-and-play functionality that extends to existing LVLMs. Extensive experimental results demonstrate that implementing QuoTA with LLaVA-Video-7B yields an average performance improvement of 3.2% across six benchmarks (including Video-MME and MLVU) while operating within the same visual token budget as the baseline. Codes are open-sourced at https://github.com/MAC-AutoML/QuoTA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08689v1-abstract-full').style.display = 'none'; document.getElementById('2503.08689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://github.com/MAC-AutoML/QuoTA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08372">arXiv:2503.08372</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08372">pdf</a>, <a href="https://arxiv.org/format/2503.08372">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> MetaFold: Language-Guided Multi-Category Garment Folding Framework via Trajectory Generation and Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haonan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junxiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+R">Ruihai Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yiwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Y">Yiwen Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhixuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jingxiang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chongkai Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zhenyu Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shensi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiaqi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+L">Lin Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08372v1-abstract-short" style="display: inline;"> Garment folding is a common yet challenging task in robotic manipulation. The deformability of garments leads to a vast state space and complex dynamics, which complicates precise and fine-grained manipulation. Previous approaches often rely on predefined key points or demonstrations, limiting their generalization across diverse garment categories. This paper presents a framework, MetaFold, that d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08372v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08372v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08372v1-abstract-full" style="display: none;"> Garment folding is a common yet challenging task in robotic manipulation. The deformability of garments leads to a vast state space and complex dynamics, which complicates precise and fine-grained manipulation. Previous approaches often rely on predefined key points or demonstrations, limiting their generalization across diverse garment categories. This paper presents a framework, MetaFold, that disentangles task planning from action prediction, learning each independently to enhance model generalization. It employs language-guided point cloud trajectory generation for task planning and a low-level foundation model for action prediction. This structure facilitates multi-category learning, enabling the model to adapt flexibly to various user instructions and folding tasks. 
Experimental results demonstrate the superiority of our proposed framework. Supplementary materials are available on our website: https://meta-fold.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08372v1-abstract-full').style.display = 'none'; document.getElementById('2503.08372v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08111">arXiv:2503.08111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08111">pdf</a>, <a href="https://arxiv.org/format/2503.08111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MaRI: Material Retrieval Integration across Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhifei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huixiong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jingwei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08111v1-abstract-short" style="display: inline;"> Accurate material retrieval is critical for creating realistic 3D assets. Existing methods rely on datasets that capture shape-invariant and lighting-varied representations of materials, which are scarce and face challenges due to limited diversity and inadequate real-world generalization. Most current approaches adopt traditional image search techniques. They fall short in capturing the unique pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08111v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08111v1-abstract-full" style="display: none;"> Accurate material retrieval is critical for creating realistic 3D assets. Existing methods rely on datasets that capture shape-invariant and lighting-varied representations of materials, which are scarce and face challenges due to limited diversity and inadequate real-world generalization. Most current approaches adopt traditional image search techniques. They fall short in capturing the unique properties of material spaces, leading to suboptimal performance in retrieval tasks. Addressing these challenges, we introduce MaRI, a framework designed to bridge the feature space gap between synthetic and real-world materials. 
MaRI constructs a shared embedding space that harmonizes visual and material attributes through a contrastive learning strategy by jointly training an image and a material encoder, bringing similar materials and images closer while separating dissimilar pairs within the feature space. To support this, we construct a comprehensive dataset comprising high-quality synthetic materials rendered with controlled shape variations and diverse lighting conditions, along with real-world materials processed and standardized using material transfer techniques. Extensive experiments demonstrate the superior performance, accuracy, and generalization capabilities of MaRI across diverse and complex material retrieval tasks, outperforming existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08111v1-abstract-full').style.display = 'none'; document.getElementById('2503.08111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07575">arXiv:2503.07575</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07575">pdf</a>, <a href="https://arxiv.org/format/2503.07575">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VisBias: Measuring Explicit and Implicit Social Biases in Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jen-tse Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+J">Jiantong Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianping Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Youliang Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jieyu Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07575v1-abstract-short" style="display: inline;"> This research investigates both explicit and implicit social biases exhibited by Vision-Language Models (VLMs). The key distinction between these bias types lies in the level of awareness: explicit bias refers to conscious, intentional biases, while implicit bias operates subconsciously. To analyze explicit bias, we directly pose questions to VLMs related to gender and racial differences: (1) Mult&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07575v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07575v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07575v1-abstract-full" style="display: none;"> This research investigates both explicit and implicit social biases exhibited by Vision-Language Models (VLMs). 
The key distinction between these bias types lies in the level of awareness: explicit bias refers to conscious, intentional biases, while implicit bias operates subconsciously. To analyze explicit bias, we directly pose questions to VLMs related to gender and racial differences: (1) Multiple-choice questions based on a given image (e.g., &#34;What is the education level of the person in the image?&#34;) (2) Yes-No comparisons using two images (e.g., &#34;Is the person in the first image more educated than the person in the second image?&#34;) For implicit bias, we design tasks where VLMs assist users but reveal biases through their responses: (1) Image description tasks: Models are asked to describe individuals in images, and we analyze disparities in textual cues across demographic groups. (2) Form completion tasks: Models draft a personal information collection form with 20 attributes, and we examine correlations among selected attributes for potential biases. We evaluate Gemini-1.5, GPT-4V, GPT-4o, LLaMA-3.2-Vision and LLaVA-v1.6. Our code and data are publicly available at https://github.com/uscnlp-lime/VisBias. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07575v1-abstract-full').style.display = 'none'; document.getElementById('2503.07575v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06659">arXiv:2503.06659</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.06659">pdf</a>, <a href="https://arxiv.org/format/2503.06659">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> PANDA: Parkinson&#39;s Assistance and Notification Driving Aid </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wen%2C+T">Tianyang Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xucheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhirong Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jing Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yicheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+N">Ning Su</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiaolan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Wei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+F">Feng Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+F+M">Franklin Mingzhe Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06659v1-abstract-short" style="display: inline;"> Parkinson&#39;s Disease (PD) significantly impacts driving abilities, often leading to early driving cessation 
or accidents due to reduced motor control and increased reaction times. To diminish the impact of these symptoms, we developed PANDA (Parkinson&#39;s Assistance and Notification Driving Aid), a multi-modality real-time alert system designed to monitor driving patterns continuously and provide im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06659v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06659v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06659v1-abstract-full" style="display: none;"> Parkinson&#39;s Disease (PD) significantly impacts driving abilities, often leading to early driving cessation or accidents due to reduced motor control and increased reaction times. To diminish the impact of these symptoms, we developed PANDA (Parkinson&#39;s Assistance and Notification Driving Aid), a multi-modality real-time alert system designed to monitor driving patterns continuously and provide immediate alerts for irregular driving behaviors, enhancing the driving safety of individuals with PD. The system was developed through a participatory design process with 9 people with PD and 13 non-PD individuals using a driving simulator, which allowed us to identify critical design characteristics and collect detailed data on driving behavior. A user study involving individuals with PD evaluated the effectiveness of PANDA, exploring optimal strategies for delivering alerts and ensuring they are timely and helpful. Our findings demonstrate that PANDA has the potential to enhance the driving safety of individuals with PD, offering a valuable tool for maintaining independence and confidence behind the wheel. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06659v1-abstract-full').style.display = 'none'; document.getElementById('2503.06659v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.

arXiv:2503.06237 [cs.CV] https://arxiv.org/abs/2503.06237
Rethinking Lanes and Points in Complex Scenarios for Monocular 3D Lane Detection
Authors: Yifan Chang, Junjie Huang, Xiaofeng Wang, Yun Ye, Zhujin Liang, Yi Shan, Dalong Du, Xingang Wang
Abstract: Monocular 3D lane detection is a fundamental task in autonomous driving. Although sparse-point methods lower computational load and maintain high accuracy in complex lane geometries, current methods fail to fully leverage the geometric structure of lanes in both lane geometry representations and model design. In lane geometry representations, we present a theoretical analysis alongside experimental validation to verify that current sparse lane representation methods contain inherent flaws, resulting in potential errors of up to 20 m, which raise significant safety concerns for driving. To address this issue, we propose a novel patching strategy to completely represent the full lane structure. To enable existing models to match this strategy, we introduce the EndPoint head (EP-head), which adds a patching distance to endpoints. The EP-head enables the model to predict more complete lane representations even with fewer preset points, effectively addressing existing limitations and paving the way for models that are faster and require fewer parameters in the future. In model design, to enhance the model's perception of lane structures, we propose the PointLane attention (PL-attention), which incorporates prior geometric knowledge into the attention mechanism. Extensive experiments demonstrate the effectiveness of the proposed methods on various state-of-the-art models. For instance, in terms of the overall F1-score, our methods improve Persformer by 4.4 points, Anchor3DLane by 3.2 points, and LATR by 2.8 points. The code will be available soon.
Submitted 8 March, 2025; originally announced March 2025.
Comments: CVPR2025
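
As a toy illustration of the endpoint truncation problem and the patching idea described in the abstract above (the 25 m anchor spacing and lane length are invented; this is not the paper's EP-head):

    import numpy as np

    # Hypothetical preset anchors every 25 m and an assumed ground-truth lane end.
    preset_y = np.linspace(0.0, 100.0, 5)        # longitudinal anchor positions (m)
    lane_end_y = 88.0                            # true lane endpoint (m)

    # A plain sparse representation can only end at the last anchor it covers,
    # so the recovered endpoint snaps back to that anchor.
    covered = preset_y[preset_y <= lane_end_y]
    print(f"truncation error without patching: {lane_end_y - covered[-1]:.1f} m")

    # An EP-head-style output would additionally regress a patching distance
    # beyond the last anchor, recovering the endpoint despite few preset points.
    patching_distance = lane_end_y - covered[-1]     # ideal value of the regressed quantity
    recovered_end = covered[-1] + patching_distance
    print(f"error with endpoint patching: {abs(recovered_end - lane_end_y):.1f} m")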

arXiv:2503.06075 [cs.RO] https://arxiv.org/abs/2503.06075
FSDP: Fast and Safe Data-Driven Overtaking Trajectory Planning for Head-to-Head Autonomous Racing Competitions
Authors: Cheng Hu, Jihao Huang, Wule Mao, Yonghao Fu, Xuemin Chi, Haotong Qin, Nicolas Baumann, Zhitao Liu, Michele Magno, Lei Xie
Abstract: Generating overtaking trajectories in autonomous racing is a challenging task, as the trajectory must satisfy the vehicle's dynamics and ensure safety and real-time performance while running on resource-constrained hardware. This work proposes the Fast and Safe Data-Driven Planner (FSDP) to address this challenge. Sparse Gaussian predictions are introduced to improve both the computational efficiency and accuracy of opponent predictions. Furthermore, the proposed approach employs a bi-level quadratic programming framework to generate an overtaking trajectory that leverages the opponent predictions. The first level uses polynomial fitting to generate a rough trajectory, from which reference states and control inputs are derived for the second level. The second level formulates a model predictive control optimization problem in the Frenet frame, generating a trajectory that satisfies both kinematic feasibility and safety. Experimental results on the F1TENTH platform show that our method outperforms the state of the art, achieving an 8.93% higher overtaking success rate, tolerating the highest opponent speed, producing a smoother ego trajectory, and reducing computation time by 74.04% compared to the Predictive Spliner method. The code is available at: https://github.com/ZJU-DDRX/FSDP.
Submitted 8 March, 2025; originally announced March 2025.
Comments: submitted to IROS 2025
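
A minimal sketch of the first level described above, assuming made-up Frenet-frame waypoints and a cubic fit; it only shows how a rough polynomial trajectory yields reference states for a second-level QP and is not the released FSDP code:

    import numpy as np

    # Coarse Frenet-frame waypoints (arc length s, lateral offset d); values invented.
    s_pts = np.array([0.0, 5.0, 10.0, 15.0, 20.0])
    d_pts = np.array([0.0, 0.3, 1.2, 1.0, 0.2])

    coeffs = np.polyfit(s_pts, d_pts, deg=3)          # level 1: rough polynomial path
    s_ref = np.linspace(0.0, 20.0, 41)
    d_ref = np.polyval(coeffs, s_ref)                 # reference lateral states
    d_slope = np.polyval(np.polyder(coeffs), s_ref)   # reference heading-like term

    # d_ref / d_slope would warm-start the second-level MPC-style QP, which then
    # enforces kinematic feasibility and collision constraints around the opponent.
    print(d_ref[:3], d_slope[:3])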

arXiv:2503.05682 [eess.IV, cs.CV] https://arxiv.org/abs/2503.05682
Task-oriented Uncertainty Collaborative Learning for Label-Efficient Brain Tumor Segmentation
Authors: Zhenxuan Zhang, Hongjie Wu, Jiahao Huang, Baihong Xie, Zhifan Gao, Junxian Du, Pete Lally, Guang Yang
Abstract: Multi-contrast magnetic resonance imaging (MRI) plays a vital role in brain tumor segmentation and diagnosis by leveraging complementary information from different contrasts. Each contrast highlights specific tumor characteristics, enabling a comprehensive understanding of tumor morphology, edema, and pathological heterogeneity. However, existing methods still face the challenges of multi-level specificity perception across different contrasts, especially with limited annotations. These challenges include data heterogeneity, granularity differences, and interference from redundant information. To address these limitations, we propose a Task-oriented Uncertainty Collaborative Learning (TUCL) framework for multi-contrast MRI segmentation. TUCL introduces a task-oriented prompt attention (TPA) module with intra-prompt and cross-prompt attention mechanisms to dynamically model feature interactions across contrasts and tasks. Additionally, a cyclic process is designed to map the predictions back to the prompt to ensure that the prompts are effectively utilized. In the decoding stage, the TUCL framework proposes a dual-path uncertainty refinement (DUR) strategy, which ensures robust segmentation by refining predictions iteratively. Extensive experimental results on limited labeled data demonstrate that TUCL significantly improves segmentation accuracy (88.2% in Dice and 10.853 mm in HD95). This shows that TUCL has the potential to extract multi-contrast information and reduce the reliance on extensive annotations. The code is available at: https://github.com/Zhenxuan-Zhang/TUCL_BrainSeg.
Submitted 7 March, 2025; originally announced March 2025.

arXiv:2503.05347 [cs.CL, cs.MA] https://arxiv.org/abs/2503.05347
GEMA-Score: Granular Explainable Multi-Agent Score for Radiology Report Evaluation
Authors: Zhenxuan Zhang, Kinhei Lee, Weihang Deng, Huichi Zhou, Zihao Jin, Jiahao Huang, Zhifan Gao, Dominic C Marshall, Yingying Fang, Guang Yang
Abstract: Automatic medical report generation supports clinical diagnosis, reduces the workload of radiologists, and holds the promise of improving diagnosis consistency. However, existing evaluation metrics primarily assess the accuracy of key medical information coverage in generated reports compared to human-written reports, while overlooking crucial details such as the location and certainty of reported abnormalities. These limitations hinder the comprehensive assessment of the reliability of generated reports and pose risks in their selection for clinical use. Therefore, we propose a Granular Explainable Multi-Agent Score (GEMA-Score) in this paper, which conducts both objective quantification and subjective evaluation through a large language model-based multi-agent workflow. Our GEMA-Score parses structured reports and employs NER-F1 calculations through interactive exchanges of information among agents to assess disease diagnosis, location, severity, and uncertainty. Additionally, an LLM-based scoring agent evaluates completeness, readability, and clinical terminology while providing explanatory feedback. Extensive experiments validate that GEMA-Score achieves the highest correlation with human expert evaluations on a public dataset, demonstrating its effectiveness in clinical scoring (Kendall coefficient = 0.70 for Rexval dataset and Kendall coefficient = 0.54 for RadEvalX dataset). The anonymous project demo is available at: https://github.com/Zhenxuan-Zhang/GEMA_score.
Submitted 7 March, 2025; originally announced March 2025.
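
A hedged sketch of the entity-level F1 component mentioned above; the (finding, location, certainty) tuple schema and the toy reports are assumptions, not GEMA-Score's actual parser or agents:

    # Entity-level F1 between findings extracted from a generated report and
    # those from the reference report.
    def entity_f1(pred_entities, ref_entities):
        pred, ref = set(pred_entities), set(ref_entities)
        tp = len(pred & ref)
        precision = tp / len(pred) if pred else 0.0
        recall = tp / len(ref) if ref else 0.0
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    ref = {("pleural effusion", "left", "present"), ("pneumothorax", "right", "absent")}
    gen = {("pleural effusion", "left", "present"), ("cardiomegaly", "n/a", "uncertain")}
    print(f"entity-level F1: {entity_f1(gen, ref):.2f}")   # 0.50 for this toy pair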

arXiv:2503.05339 [eess.IV, cs.CV] https://arxiv.org/abs/2503.05339
Pretext Task Adversarial Learning for Unpaired Low-field to Ultra High-field MRI Synthesis
Authors: Zhenxuan Zhang, Peiyuan Jing, Coraline Beitone, Jiahao Huang, Zhifan Gao, Guang Yang, Pete Lally
Abstract: Given the scarcity and cost of high-field MRI, the synthesis of high-field MRI from low-field MRI holds significant potential when there is limited data for training downstream tasks (e.g. segmentation). Low-field MRI often suffers from a reduced signal-to-noise ratio (SNR) and spatial resolution compared to high-field MRI. However, synthesizing high-field MRI data presents challenges. These involve aligning image features across domains while preserving anatomical accuracy and enhancing fine details. To address these challenges, we propose a Pretext Task Adversarial (PTA) learning framework for high-field MRI synthesis from low-field MRI data. The framework comprises three processes: (1) the slice-wise gap perception (SGP) network aligns the slice inconsistencies of low-field and high-field datasets based on contrastive learning; (2) the local structure correction (LSC) network extracts local structures by restoring locally rotated and masked images; (3) the pretext task-guided adversarial training process introduces additional supervision and incorporates a discriminator to improve image realism. Extensive experiments on the low-field to ultra high-field task demonstrate the effectiveness of our method, achieving state-of-the-art performance (16.892 in FID, 1.933 in IS, and 0.324 in MS-SSIM). This enables the generation of high-quality high-field-like MRI data from low-field MRI data to augment training datasets for downstream tasks. The code is available at: https://github.com/Zhenxuan-Zhang/PTA4Unpaired_HF_MRI_SYN.
Submitted 7 March, 2025; originally announced March 2025.
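
A minimal sketch of the kind of pretext-task corruption mentioned above (locally rotating and masking patches that a correction network would learn to restore); the patch size, corruption rates, and NumPy-only pipeline are assumptions rather than the released PTA code:

    import numpy as np

    def corrupt_slice(img, patch=16, rot_prob=0.3, mask_prob=0.2, seed=0):
        """Randomly mask or locally rotate square patches of a 2D slice."""
        rng = np.random.default_rng(seed)
        out = img.copy()
        h, w = img.shape
        for y in range(0, h - patch + 1, patch):
            for x in range(0, w - patch + 1, patch):
                block = out[y:y + patch, x:x + patch]
                r = rng.random()
                if r < mask_prob:
                    block[:] = 0.0                        # masked patch to be restored
                elif r < mask_prob + rot_prob:
                    block[:] = np.rot90(block).copy()     # locally rotated patch
        return out

    slice_lf = np.random.default_rng(1).random((64, 64)).astype(np.float32)
    corrupted = corrupt_slice(slice_lf)                   # input to the correction network
    print(corrupted.shape)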

arXiv:2503.05130 [cs.DC] https://arxiv.org/abs/2503.05130
Dilu: Enabling GPU Resourcing-on-Demand for Serverless DL Serving via Introspective Elasticity
Authors: Cunchi Lv, Xiao Shi, Zhengyu Lei, Jinyue Huang, Wenting Tan, Xiaohui Zheng, Xiaofang Zhao
Abstract: Serverless computing, with its ease of management, auto-scaling, and cost-effectiveness, is widely adopted by deep learning (DL) applications. DL workloads, especially with large language models, require substantial GPU resources to ensure QoS. However, serverless DL systems are prone to producing GPU fragments (e.g., 15%-94%) due to the dynamicity of workloads and coarse-grained static GPU allocation mechanisms, gradually eroding the profits offered by serverless elasticity. Different from classical serverless systems that only scale horizontally, we present introspective elasticity (IE), a fine-grained and adaptive two-dimensional co-scaling mechanism to support GPU resourcing-on-demand for serverless DL tasks. Based on this insight, we build Dilu, a cross-layer and GPU-based serverless DL system with IE support. First, Dilu provides multi-factor profiling for DL tasks with efficient pruning search methods. Second, Dilu adheres to resourcing-complementary principles in scheduling to improve GPU utilization with QoS guarantees. Third, Dilu adopts an adaptive 2D co-scaling method to enhance the elasticity of GPU provisioning in real time. Evaluations show that it can dynamically adjust the resourcing of various DL functions with low GPU fragmentation (10%-46% GPU defragmentation), high throughput (up to 1.8x inference and 1.1x training throughput increase), and QoS guarantees (11%-71% violation rate reduction), compared to the SOTA baselines.
Submitted 6 March, 2025; originally announced March 2025.
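
A back-of-the-envelope illustration of the GPU fragmentation problem described above: statically reserved slices can leave plenty of aggregate capacity that no single pending request fits into. All numbers below are invented:

    # Fraction of each GPU already reserved by static per-function allocations.
    gpus = [
        {"total": 1.0, "allocated": 0.7},
        {"total": 1.0, "allocated": 0.9},
        {"total": 1.0, "allocated": 0.6},
    ]
    pending_request = 0.5                     # a new function instance needs half a GPU

    free_per_gpu = [g["total"] - g["allocated"] for g in gpus]
    total_free = sum(free_per_gpu)            # 0.8 GPUs free in aggregate
    placeable = any(f >= pending_request for f in free_per_gpu)

    print(f"aggregate free capacity: {total_free:.1f} GPU, "
          f"request placeable without co-scaling: {placeable}")
    # Fine-grained vertical scaling (resizing existing allocations) alongside
    # horizontal scaling is what would let a system reclaim these fragments.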

arXiv:2503.05118 [cs.CV] https://arxiv.org/abs/2503.05118
SMILENet: Unleashing Extra-Large Capacity Image Steganography via a Synergistic Mosaic InvertibLE Hiding Network
Authors: Jun-Jie Huang, Zihan Chen, Tianrui Liu, Wentao Zhao, Xin Deng, Xinwang Liu, Meng Wang, Pier Luigi Dragotti
Abstract: Existing image steganography methods face fundamental limitations in hiding capacity (typically 1-7 images) due to severe information interference and an uncoordinated capacity-distortion trade-off. We propose SMILENet, a novel synergistic framework that achieves 25-image hiding through three key innovations: (i) a synergistic network architecture coordinates reversible and non-reversible operations to efficiently exploit information redundancy in both secret and cover images. The reversible Invertible Cover-Driven Mosaic (ICDM) module and Invertible Mosaic Secret Embedding (IMSE) module establish cover-guided mosaic transformations and representation embedding with mathematically guaranteed invertibility for distortion-free embedding. The non-reversible Secret Information Selection (SIS) module and Secret Detail Enhancement (SDE) module implement learnable feature modulation for critical information selection and enhancement. (ii) A unified training strategy coordinates the complementary modules to achieve 3.0x higher capacity than existing methods with superior visual quality. (iii) We introduce a new capacity-distortion trade-off metric for evaluating image steganography algorithms that jointly considers hiding capacity and distortion, providing a unified way to assess results with different numbers of secret images. Extensive experiments on DIV2K, Paris StreetView and ImageNet1K show that SMILENet outperforms state-of-the-art methods in hiding capacity, recovery quality, and security against steganalysis methods.
Submitted 6 March, 2025; originally announced March 2025.
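
A hedged sketch of one plausible reading of the mosaic idea above: tiling downscaled secrets into a single cover-sized canvas that an invertible network could then embed. The 5x5 grid, shapes, and average-pooling downscale are assumptions for illustration, not the SMILENet modules:

    import numpy as np

    def make_mosaic(secrets, grid=5):
        # secrets: (grid*grid, H, W, C) with H and W divisible by grid
        n, h, w, c = secrets.shape
        assert n == grid * grid and h % grid == 0 and w % grid == 0
        th, tw = h // grid, w // grid
        # crude average-pooling downscale of each secret to tile size
        tiles = secrets.reshape(n, th, grid, tw, grid, c).mean(axis=(2, 4))
        mosaic = np.zeros((h, w, c), dtype=secrets.dtype)
        for k in range(n):
            i, j = divmod(k, grid)
            mosaic[i * th:(i + 1) * th, j * tw:(j + 1) * tw] = tiles[k]
        return mosaic   # one cover-sized plane carrying all 25 downscaled secrets

    secrets = np.random.rand(25, 100, 100, 3).astype(np.float32)
    print(make_mosaic(secrets).shape)   # (100, 100, 3)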

arXiv:2503.05066 [cs.LG, cs.AI, cs.CL] https://arxiv.org/abs/2503.05066
Capacity-Aware Inference: Mitigating the Straggler Effect in Mixture of Experts
Authors: Shwai He, Weilin Cai, Jiayi Huang, Ang Li
Abstract: The Mixture of Experts (MoE) is an effective architecture for scaling large language models by leveraging sparse expert activation, optimizing the trade-off between performance and efficiency. However, under expert parallelism, MoE suffers from inference inefficiencies due to imbalanced token-to-expert assignment, where some experts are overloaded while others remain underutilized. This imbalance leads to poor resource utilization and increased latency, as the most burdened expert dictates the overall delay, a phenomenon we define as the Straggler Effect. To mitigate this, we propose Capacity-Aware Inference, which includes two key techniques: (1) Capacity-Aware Token Drop, which discards overloaded tokens to regulate the maximum latency of MoE, and (2) Capacity-Aware Token Reroute, which reallocates overflowed tokens to underutilized experts, balancing the token distribution. These techniques collectively optimize both high-load and low-load expert utilization, leading to a more efficient MoE inference pipeline. Extensive experiments demonstrate the effectiveness of our methods, showing significant improvements in inference efficiency, e.g., a 0.2% average performance increase and a 1.94x inference speedup on Mixtral-8x7B-Instruct.
Submitted 6 March, 2025; originally announced March 2025.
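
A minimal sketch of the two capacity-aware techniques described above, using a hand-written token-to-expert assignment instead of a real MoE router; the capacity value and tie-breaking rule are arbitrary:

    from collections import defaultdict

    def capacity_aware_dispatch(assignments, num_experts, capacity, reroute=True):
        """Cap each expert's load; overflowed tokens are dropped or rerouted."""
        load = defaultdict(list)
        dropped, rerouted = [], []
        for token, expert in assignments:
            if len(load[expert]) < capacity:
                load[expert].append(token)
            elif reroute:
                # send the overflowed token to the currently least-loaded expert
                target = min(range(num_experts), key=lambda e: len(load[e]))
                load[target].append(token)
                rerouted.append(token)
            else:
                dropped.append(token)
        return dict(load), dropped, rerouted

    # 8 tokens, 4 experts, heavily skewed toward expert 0 (the would-be straggler).
    assignments = [(t, 0) for t in range(6)] + [(6, 1), (7, 2)]
    load, dropped, rerouted = capacity_aware_dispatch(assignments, num_experts=4, capacity=2)
    print(load, dropped, rerouted)   # loads end up balanced at 2 tokens per expert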

arXiv:2503.05063 [eess.IV, cs.CV] https://arxiv.org/abs/2503.05063
Lightweight Hypercomplex MRI Reconstruction: A Generalized Kronecker-Parameterized Approach
Authors: Haosen Zhang, Jiahao Huang, Yinzhe Wu, Congren Dai, Fanwen Wang, Zhenxuan Zhang, Guang Yang
Abstract: Magnetic Resonance Imaging (MRI) is crucial for clinical diagnostics but is hindered by prolonged scan times. Current deep learning models enhance MRI reconstruction but are often memory-intensive and unsuitable for resource-limited systems. This paper introduces a lightweight MRI reconstruction model leveraging Kronecker-Parameterized Hypercomplex Neural Networks to achieve high performance with reduced parameters. By integrating Kronecker-based modules, including Kronecker MLP, Kronecker Window Attention, and Kronecker Convolution, the proposed model efficiently extracts spatial features while preserving representational power. We introduce Kronecker U-Net and Kronecker SwinMR, which maintain high reconstruction quality with approximately 50% fewer parameters compared to existing models. Experimental evaluation on the FastMRI dataset demonstrates competitive PSNR, SSIM, and LPIPS metrics, even at high acceleration factors (8x and 16x), with no significant performance drop. Additionally, Kronecker variants exhibit superior generalization and reduced overfitting on limited datasets, facilitating efficient MRI reconstruction on hardware-constrained systems. This approach sets a new benchmark for parameter-efficient medical imaging models.
Submitted 11 March, 2025; v1 submitted 6 March, 2025; originally announced March 2025.
Comments: 11 pages, 3 figures. Submitted for publication
ACM Class: I.2.6; I.4.5
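
The parameter saving behind Kronecker-parameterized layers can be sketched with plain NumPy: a dense weight is replaced by the Kronecker product of two small factors, applied without ever materializing the large matrix. The factor shapes here are arbitrary and this is not the paper's exact module design:

    import numpy as np

    rng = np.random.default_rng(0)
    m, n, p, q = 32, 32, 32, 32
    A = rng.standard_normal((m, n))                  # small factor 1
    B = rng.standard_normal((p, q))                  # small factor 2
    x = rng.standard_normal(n * q)                   # flattened input feature

    # Dense equivalent: an (m*p) x (n*q) weight matrix, ~1.05M parameters here.
    y_dense = np.kron(A, B) @ x

    # Kronecker trick: the same linear map using only A and B (2,048 parameters),
    # via (A kron B) x == vec(A @ X @ B.T) with X = x reshaped row-major to (n, q).
    y_factored = (A @ x.reshape(n, q) @ B.T).reshape(m * p)

    print(np.allclose(y_dense, y_factored))          # True
    print("dense params:", m * p * n * q, "vs factored:", A.size + B.size)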

arXiv:2503.05057 [cs.RO] https://arxiv.org/abs/2503.05057
Prismatic-Bending Transformable (PBT) Joint for a Modular, Foldable Manipulator with Enhanced Reachability and Dexterity
Authors: Jianshu Zhou, Junda Huang, Boyuan Liang, Xiang Zhang, Xin Ma, Masayoshi Tomizuka
Abstract: Robotic manipulators, traditionally designed with classical joint-link articulated structures, excel in industrial applications but face challenges in human-centered and general-purpose tasks requiring greater dexterity and adaptability. Addressing these limitations, we introduce the Prismatic-Bending Transformable (PBT) Joint, a novel design inspired by the scissors mechanism, enabling transformable kinematic chains. Each PBT joint module provides three degrees of freedom (bending, rotation, and elongation/contraction), allowing scalable and reconfigurable assemblies to form diverse kinematic configurations tailored to specific tasks. This innovative design surpasses conventional systems, delivering superior flexibility and performance across various applications. We present the design, modeling, and experimental validation of the PBT joint, demonstrating its integration into modular and foldable robotic arms. The PBT joint functions as a single SKU, enabling manipulators to be constructed entirely from standardized PBT joints without additional customized components. It also serves as a modular extension for existing systems, such as wrist modules, streamlining design, deployment, transportation, and maintenance. Three sizes (large, medium, and small) have been developed and integrated into robotic manipulators, highlighting their enhanced dexterity, reachability, and adaptability for manipulation tasks. This work represents a significant advancement in robotic design, offering scalable and efficient solutions for dynamic and unstructured environments.
Submitted 6 March, 2025; originally announced March 2025.
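
A hedged kinematics sketch for a chain of PBT-style modules, treating each module as an axial rotation, a bend, and an extensible translation composed as 4x4 homogeneous transforms; this parameterization is an assumption for illustration, not the paper's joint model:

    import numpy as np

    def module_transform(theta_rot, theta_bend, length):
        """Rotation about z, bend about y, then translation along the module axis."""
        cz, sz = np.cos(theta_rot), np.sin(theta_rot)
        cy, sy = np.cos(theta_bend), np.sin(theta_bend)
        Rz = np.array([[cz, -sz, 0, 0], [sz, cz, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1.0]])
        Ry = np.array([[cy, 0, sy, 0], [0, 1, 0, 0], [-sy, 0, cy, 0], [0, 0, 0, 1.0]])
        Tz = np.eye(4)
        Tz[2, 3] = length                   # elongation/contraction along the axis
        return Rz @ Ry @ Tz

    def chain_pose(modules):
        T = np.eye(4)
        for theta_rot, theta_bend, length in modules:
            T = T @ module_transform(theta_rot, theta_bend, length)
        return T                            # base-to-tip pose of the stacked assembly

    # Three stacked modules with different rotation/bend/extension settings (rad, rad, m).
    pose = chain_pose([(0.0, np.pi / 6, 0.12), (np.pi / 4, np.pi / 8, 0.10), (0.0, 0.0, 0.15)])
    print(pose[:3, 3])                      # tip position in the base frame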

arXiv:2503.04645 [cs.IT, eess.SP] https://arxiv.org/abs/2503.04645
Ultra-Low-Latency Edge Intelligent Sensing: A Source-Channel Tradeoff and Its Application to Coding Rate Adaptation
Authors: Qunsong Zeng, Jianhao Huang, Zhanwei Wang, Kaibin Huang, Kin K. Leung
Abstract: The forthcoming sixth-generation (6G) mobile network is set to merge edge artificial intelligence (AI) and integrated sensing and communication (ISAC) extensively, giving rise to the new paradigm of edge intelligent sensing (EI-Sense). This paradigm leverages ubiquitous edge devices for environmental sensing and deploys AI algorithms at edge servers to interpret the observations via remote inference on wirelessly uploaded features. A significant challenge arises in designing EI-Sense systems for 6G mission-critical applications, which demand high performance under stringent latency constraints. To tackle this challenge, we focus on the end-to-end (E2E) performance of EI-Sense and characterize a source-channel tradeoff that balances source distortion and channel reliability. In this work, we establish a theoretical foundation for the source-channel tradeoff by quantifying the effects of source coding on feature discriminant gains and channel reliability on packet loss. Building on this foundation, we design the coding rate control by optimizing the tradeoff to minimize the E2E sensing error probability, leading to a low-complexity algorithm for ultra-low-latency EI-Sense. Finally, we validate our theoretical analysis and proposed coding rate control algorithm through extensive experiments on both synthetic and real datasets, demonstrating the sensing performance gain of our approach with respect to traditional reliability-centric methods.
Submitted 6 March, 2025; originally announced March 2025.
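
A toy sketch of the coding-rate control idea above: sweep the source coding rate and pick the one minimizing an end-to-end error that trades a distortion term (falling with rate) against a packet-loss term (rising with rate under a fixed latency budget). Both error models below are invented placeholders, not the paper's discriminant-gain or finite-blocklength expressions:

    import numpy as np

    rates = np.linspace(0.5, 8.0, 76)                  # candidate coding rates (bits/feature dim)
    err_source = 0.5 * np.exp(-0.9 * rates)            # assumed sensing error from quantization
    p_loss = 1.0 - np.exp(-0.04 * rates ** 2)          # assumed packet-loss probability at fixed latency
    err_e2e = (1 - p_loss) * err_source + p_loss * 0.5 # lost packet treated as a random binary guess

    best = rates[np.argmin(err_e2e)]
    print(f"selected coding rate: {best:.2f} bits/dim, E2E error: {err_e2e.min():.3f}")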

arXiv:2503.04521 [cs.AI, cs.CE, cs.DC, cs.SE] https://arxiv.org/abs/2503.04521
Dynamic Pricing for On-Demand DNN Inference in the Edge-AI Market
Authors: Songyuan Li, Jia Hu, Geyong Min, Haojun Huang, Jiwei Huang
Abstract: The convergence of edge computing and AI gives rise to Edge-AI, which enables the deployment of real-time AI applications and services at the network edge. One of the fundamental research issues in Edge-AI is edge inference acceleration, which aims to realize low-latency high-accuracy DNN inference services by leveraging the fine-grained offloading of partitioned inference tasks from end devices to edge servers. However, existing research has yet to adopt a practical Edge-AI market perspective, which would systematically explore the personalized inference needs of AI users (e.g., inference accuracy, latency, and task complexity), the revenue incentives for AI service providers that offer edge inference services, and multi-stakeholder governance within a market-oriented context. To bridge this gap, we propose an Auction-based Edge Inference Pricing Mechanism (AERIA) for revenue maximization to tackle the multi-dimensional optimization problem of DNN model partition, edge inference pricing, and resource allocation. We investigate the multi-exit device-edge synergistic inference scheme for on-demand DNN inference acceleration, and analyse the auction dynamics amongst the AI service providers, AI users and edge infrastructure provider. Owing to the strategic mechanism design via randomized consensus estimate and cost sharing techniques, the Edge-AI market attains several desirable properties, including competitiveness in revenue maximization, incentive compatibility, and envy-freeness, which are crucial to maintain the effectiveness, truthfulness, and fairness of our auction outcomes. The extensive simulation experiments based on four representative DNN inference workloads demonstrate that our AERIA mechanism significantly outperforms several state-of-the-art approaches in revenue maximization, demonstrating the efficacy of AERIA for on-demand DNN inference in the Edge-AI market.
Submitted 6 March, 2025; originally announced March 2025.
Comments: Index Terms: Edge-AI, DNN Inference Offloading, Resource Management, Dynamic Pricing, Auction Mechanism
href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
