
Search | arXiv e-print repository

Showing 1–50 of 4,365 results for author: Wang, L

Searching in archive cs; 50 results per page, sorted by announcement date (newest first).

1. arXiv:2503.24235 [pdf, other] (cs.CL, cs.AI)
What, How, Where, and How Well? A Survey on Test-Time Scaling in Large Language Models
Authors: Qiyuan Zhang, Fuyuan Lyu, Zexu Sun, Lei Wang, Weixu Zhang, Zhihan Guo, Yufei Wang, Irwin King, Xue Liu, Chen Ma
Abstract: As enthusiasm for scaling computation (data and parameters) in the pretraining era gradually diminished, test-time scaling (TTS), also referred to as "test-time computing", has emerged as a prominent research focus. Recent studies demonstrate that TTS can further elicit the problem-solving capabilities of large language models (LLMs), enabling significant breakthroughs not only in specialized reasoning tasks, such as mathematics and coding, but also in general tasks like open-ended Q&A. However, despite the explosion of recent efforts in this area, there remains an urgent need for a comprehensive survey offering a systematic understanding. To fill this gap, we propose a unified, multidimensional framework structured along four core dimensions of TTS research: what to scale, how to scale, where to scale, and how well to scale. Building upon this taxonomy, we conduct an extensive review of methods, application scenarios, and assessment aspects, and present an organized decomposition that highlights the unique functional roles of individual techniques within the broader TTS landscape. From this analysis, we distill the major developmental trajectories of TTS to date and offer hands-on guidelines for practical deployment. Furthermore, we identify several open challenges and offer insights into promising future directions, including further scaling, clarifying the functional essence of techniques, generalizing to more tasks, and more attributions.
Submitted 31 March, 2025; originally announced March 2025.

2. arXiv:2503.23726 [pdf, other] (cs.LG)
PDSL: Privacy-Preserved Decentralized Stochastic Learning with Heterogeneous Data Distribution
Authors: Lina Wang, Yunsheng Yuan, Chunxiao Wang, Feng Li
Abstract: In the paradigm of decentralized learning, a group of agents collaborates to learn a global model using distributed datasets without a central server. However, due to the heterogeneity of the local data across the different agents, learning a robust global model is rather challenging. Moreover, the collaboration of the agents relies on their gradient information exchange, which poses a risk of privacy leakage. In this paper, to address these issues, we propose PDSL, a novel privacy-preserved decentralized stochastic learning algorithm for heterogeneous data distributions. On one hand, we utilize the notion of Shapley values so that each agent can precisely measure the contributions of its heterogeneous neighbors to the global learning goal; on the other hand, we leverage the notion of differential privacy to prevent each agent from suffering privacy leakage when it contributes gradient information to its neighbors. We conduct both solid theoretical analysis and extensive experiments to demonstrate the efficacy of our PDSL algorithm in terms of privacy preservation and convergence.
Submitted 31 March, 2025; originally announced March 2025.
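The PDSL abstract pairs two standard ingredients: Shapley-style weighting of each neighbor's contribution and differentially private gradient exchange. A minimal Python sketch of how those pieces can fit together follows; the one-step marginal-gain proxy for Shapley values, the toy quadratic losses, and every name in it are illustrative assumptions, not the authors' algorithm.

```python
import numpy as np

rng = np.random.default_rng(0)

def dp_noisy_grad(grad, clip=1.0, sigma=0.5):
    # Clip to bound sensitivity, then add Gaussian noise: the usual DP recipe.
    g = grad * min(1.0, clip / (np.linalg.norm(grad) + 1e-12))
    return g + rng.normal(0.0, sigma * clip, size=g.shape)

def contribution_weights(params, grads, loss_fn, lr=0.1):
    # One-step marginal-gain proxy for Shapley values: how much a neighbor's
    # gradient alone reduces the local loss. (True Shapley values average
    # marginal gains over all coalitions; this keeps only the simplest term.)
    base = loss_fn(params)
    gain = np.array([max(base - loss_fn(params - lr * g), 0.0) for g in grads])
    total = gain.sum()
    return gain / total if total > 0 else np.full(len(grads), 1.0 / len(grads))

# Toy heterogeneous setting: each agent's local optimum is a different target.
targets = [np.array([1.0, 0.0]), np.array([0.0, 2.0]), np.array([-1.0, 1.0])]
mean_loss = lambda p: np.mean([0.5 * np.sum((p - t) ** 2) for t in targets])

w = np.zeros(2)
for _ in range(200):
    shared = [dp_noisy_grad(w - t) for t in targets]   # privatized local gradients
    wts = contribution_weights(w, shared, mean_loss)
    w = w - 0.1 * sum(a * g for a, g in zip(wts, shared))
print(w)  # drifts toward a weighted consensus of the heterogeneous targets
```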
3. arXiv:2503.23712 [pdf, other] (cs.CV)
ElimPCL: Eliminating Noise Accumulation with Progressive Curriculum Labeling for Source-Free Domain Adaptation
Authors: Jie Cheng, Hao Zheng, Meiguang Zheng, Lei Wang, Hao Wu, Jian Zhang
Abstract: Source-Free Domain Adaptation (SFDA) aims to train a target model without source data, and the key is to generate pseudo-labels using a pre-trained source model. However, we observe that the source model often produces highly uncertain pseudo-labels for hard samples, particularly those heavily affected by domain shifts, leading to these noisy pseudo-labels being introduced even before adaptation and further reinforced through parameter updates. Additionally, they continuously influence neighbor samples through propagation in the feature space. To eliminate the issue of noise accumulation, we propose a novel Progressive Curriculum Labeling (ElimPCL) method, which iteratively filters trustworthy pseudo-labeled samples based on prototype consistency to exclude high-noise samples from training. Furthermore, a Dual MixUP technique is designed in the feature space to enhance the separability of hard samples, thereby mitigating the interference of noisy samples on their neighbors. Extensive experiments validate the effectiveness of ElimPCL, achieving up to a 3.4% improvement on challenging tasks compared to state-of-the-art methods.
Submitted 31 March, 2025; originally announced March 2025.
Comments: ICME 2025 camera-ready
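Prototype-consistency filtering of the kind this abstract describes can be sketched compactly: keep a sample only when the classifier's pseudo-label matches the label of its nearest class prototype, with a confidence threshold that relaxes over rounds (the curriculum). The function below is such a sketch under those assumptions; it is not ElimPCL itself, and the names and the random stand-in data are hypothetical.

```python
import numpy as np

def prototype_consistent_mask(feats, probs, conf_thresh):
    # Pseudo-label = classifier argmax; keep a sample only if the nearest class
    # prototype (confidence-weighted mean feature) agrees and the confidence
    # clears the current curriculum threshold.
    pseudo, conf = probs.argmax(1), probs.max(1)
    protos = np.stack([(feats * probs[:, [c]]).sum(0) / (probs[:, c].sum() + 1e-12)
                       for c in range(probs.shape[1])])
    protos /= np.linalg.norm(protos, axis=1, keepdims=True) + 1e-12
    fn = feats / (np.linalg.norm(feats, axis=1, keepdims=True) + 1e-12)
    proto_label = (fn @ protos.T).argmax(1)            # cosine-nearest prototype
    return (pseudo == proto_label) & (conf >= conf_thresh)

# Progressive curriculum: start strict, admit harder samples round by round.
rng = np.random.default_rng(0)
feats = rng.normal(size=(500, 64))                     # stand-in target features
probs = rng.dirichlet(np.ones(10), size=500)           # stand-in source-model outputs
for thresh in np.linspace(0.9, 0.5, 5):
    mask = prototype_consistent_mask(feats, probs, thresh)
    # ...adapt the model on feats[mask] with pseudo-labels probs[mask].argmax(1)...
```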
4. arXiv:2503.22767 [pdf] (cs.RO, cs.CE)
Co-design of materials, structures and stimuli for magnetic soft robots with large deformation and dynamic contacts
Authors: Liwei Wang
Abstract: Magnetic soft robots embedded with hard magnetic particles enable untethered actuation via external magnetic fields, offering remote, rapid, and precise control, which is highly promising for biomedical applications. However, designing such systems is challenging due to the complex interplay of magneto-elastic dynamics, large deformation, solid contacts, time-varying stimuli, and posture-dependent loading. As a result, most existing research relies on heuristics and trial-and-error methods or focuses on the independent design of stimuli or structures under static conditions. We propose a topology optimization framework for magnetic soft robots that simultaneously designs structures, location-specific material magnetization, and time-varying magnetic stimuli, accounting for large deformations, dynamic motion, and solid contacts. This is achieved by integrating generalized topology optimization with the magneto-elastic material point method, which supports GPU-accelerated parallel simulations and auto-differentiation for sensitivity analysis. We applied this framework to design magnetic robots for various tasks, including multi-task shape morphing and locomotion, in both 2D and 3D. The method autonomously generates optimized robotic systems to achieve target behaviors without requiring human intervention. Despite the nonlinear physics and large design space, it demonstrates exceptional efficiency, completing all cases within minutes. This framework represents a significant step toward the automatic co-design of magnetic soft robots for applications such as metasurfaces, drug delivery, and minimally invasive procedures.
Submitted 27 March, 2025; originally announced March 2025.

5. arXiv:2503.22724 [pdf, other] (cs.LG)
A Spatial-temporal Deep Probabilistic Diffusion Model for Reliable Hail Nowcasting with Radar Echo Extrapolation
Authors: Haonan Shi, Long Tian, Jie Tao, Yufei Li, Liming Wang, Xiyang Liu
Abstract: Hail is a considerable contributor to meteorological disasters, and there is a great need to mitigate its socioeconomic effects through precise forecasts with high resolution, long lead times, and local detail across large landscapes. Existing medium-range weather forecasting methods primarily rely on changes in upper air currents and cloud layers to predict precipitation events such as heavy rainfall, which makes them unsuitable for hail nowcasting, since hail is mainly caused by low-altitude local strong convection associated with terrain. Radar, by contrast, captures the status of low cloud layers, such as water vapor, droplets, and ice crystals, providing rich signals suitable for hail nowcasting. To this end, we introduce a Spatial-Temporal gEnerAtive Model called SteamCast for hail nowcasting with radar echo extrapolation. It is a deep probabilistic diffusion model built on spatial-temporal representations, including radar echoes as well as their position/time embeddings, trained on a historical reanalysis archive from the Yan'an Meteorological Bureau in China, where crops such as apples suffer greatly from hail damage. Considering the short-term nature of hail, SteamCast provides 30-minute nowcasts at 6-minute intervals for a single radar reflectivity variable, across 9 different vertical angles, on a latitude-longitude grid with approximately 1 km x 1 km resolution per pixel in Yan'an City, China. By successfully fusing the spatial-temporal features of radar echoes, SteamCast delivers competitive, and in some cases superior, results compared to other deep learning-based models such as PredRNN and VMRNN.
Submitted 26 March, 2025; originally announced March 2025.

6. arXiv:2503.22262 [pdf, other] (cs.CV)
Mono2Stereo: A Benchmark and Empirical Study for Stereo Conversion
Authors: Songsong Yu, Yuxin Chen, Zhongang Qi, Zeke Xie, Yifan Wang, Lijun Wang, Ying Shan, Huchuan Lu
Abstract: With the rapid proliferation of 3D devices and the shortage of 3D content, stereo conversion is attracting increasing attention. Recent works introduce pretrained Diffusion Models (DMs) into this task. However, due to the scarcity of large-scale training data and comprehensive benchmarks, the optimal methodologies for employing DMs in stereo conversion and the accurate evaluation of stereo effects remain largely unexplored. In this work, we introduce the Mono2Stereo dataset, providing high-quality training data and a benchmark to support in-depth exploration of stereo conversion. With this dataset, we conduct an empirical study that yields two primary findings. 1) The differences between the left and right views are subtle, yet existing metrics consider overall pixels, failing to concentrate on regions critical to stereo effects. 2) Mainstream methods adopt either one-stage left-to-right generation or a warp-and-inpaint pipeline, facing challenges of degraded stereo effect and image distortion, respectively. Based on these findings, we introduce a new evaluation metric, Stereo Intersection-over-Union, which prioritizes disparity and achieves a high correlation with human judgments on stereo effect. Moreover, we propose a strong baseline model that harmonizes stereo effect and image quality, notably surpassing current mainstream methods. Our code and data will be open-sourced to promote further research in stereo conversion. Our models are available at mono2stereo-bench.github.io.
Submitted 28 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025. Project webpage: https://mono2stereo-bench.github.io/
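The Stereo Intersection-over-Union metric is only named in the abstract, not defined. One plausible disparity-first construction, offered purely as a sketch and not as the paper's definition: bin pixels by disparity and average per-bin IoU between predicted and reference disparity maps, so errors in depth structure dominate the score rather than raw pixel agreement.

```python
import numpy as np

def stereo_iou(disp_pred, disp_gt, n_bins=32, max_disp=64.0):
    # Bin pixels by disparity and average per-bin IoU, so the score is driven
    # by where the disparity mass sits rather than by overall pixel colour.
    edges = np.linspace(0.0, max_disp, n_bins + 1)
    ious = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        p = (disp_pred >= lo) & (disp_pred < hi)
        g = (disp_gt >= lo) & (disp_gt < hi)
        union = (p | g).sum()
        if union:
            ious.append((p & g).sum() / union)
    return float(np.mean(ious)) if ious else 1.0

# disp_pred / disp_gt would come from running any stereo matcher on the
# generated and the reference stereo pairs.
```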
7. arXiv:2503.22224 [pdf, other] (cs.NE)
Composite Indicator-Guided Infilling Sampling for Expensive Multi-Objective Optimization
Authors: Huixiang Zhen, Xiaotong Li, Wenyin Gong, Ling Wang, Xiangyun Hu
Abstract: In expensive multi-objective optimization, where the evaluation budget is strictly limited, selecting promising candidate solutions for expensive fitness evaluations is critical for accelerating convergence and improving algorithmic performance. However, designing an optimization strategy that effectively balances convergence, diversity, and distribution remains a challenge. To tackle this issue, we propose a composite indicator-based evolutionary algorithm (CI-EMO) for expensive multi-objective optimization. In each generation of the optimization process, CI-EMO first employs NSGA-III to explore the solution space based on fitness values predicted by surrogate models, generating a candidate population. Subsequently, we design a novel composite performance indicator to guide the selection of candidates for real fitness evaluation. This indicator simultaneously considers convergence, diversity, and distribution to improve the efficiency of identifying promising candidate solutions, which significantly improves algorithm performance. The composite indicator-based candidate selection strategy is easy to implement and computationally efficient. Component analysis experiments confirm the effectiveness of each element in the composite performance indicator. Comparative experiments on benchmark problems demonstrate that the proposed algorithm outperforms five state-of-the-art expensive multi-objective optimization algorithms.
Submitted 28 March, 2025; originally announced March 2025.
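A composite indicator of the kind this abstract describes can be assembled from three normalized terms. The sketch below is one plausible construction rather than CI-EMO's actual indicator: the weights, the reference directions, and the greedy selection loop are all assumptions.

```python
import numpy as np

def composite_scores(F, selected, ref_dirs, w=(1.0, 1.0, 1.0)):
    # Score surrogate-predicted objective vectors F (n x m) by three terms:
    # convergence (closeness to the ideal point), diversity (distance to the
    # nearest already-selected candidate), and distribution (alignment with a
    # reference direction, in the spirit of NSGA-III).
    Fn = (F - F.min(0)) / (F.max(0) - F.min(0) + 1e-12)
    conv = 1.0 - np.linalg.norm(Fn, axis=1) / np.sqrt(F.shape[1])
    if selected:
        div = np.min(np.linalg.norm(Fn[:, None] - Fn[selected][None], axis=2), axis=1)
        div /= div.max() + 1e-12
    else:
        div = np.ones(len(F))
    cos = (Fn @ ref_dirs.T) / (np.linalg.norm(Fn, axis=1, keepdims=True)
                               * np.linalg.norm(ref_dirs, axis=1) + 1e-12)
    dist = cos.max(1)
    return w[0] * conv + w[1] * div + w[2] * dist

# Greedy infilling: repeatedly pick the best-scoring candidate for expensive evaluation.
rng = np.random.default_rng(0)
F = rng.random((100, 3))                       # surrogate-predicted objectives
refs = rng.dirichlet(np.ones(3), size=12)      # rough stand-in reference directions
selected = []
for _ in range(5):
    s = composite_scores(F, selected, refs)
    s[selected] = -np.inf                      # never re-pick a candidate
    selected.append(int(s.argmax()))
```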
8. arXiv:2503.22118 [pdf, other] (cs.LG)
Estimating City-wide Operating Mode Distribution of Light-Duty Vehicles: A Neural Network-based Approach
Authors: Muhammad Usama, Haris N. Koutsopoulos, Zhengbing He, Lijiao Wang
Abstract: Driving cycles are a set of driving conditions and are crucial for existing emission estimation models to evaluate vehicle performance, fuel efficiency, and emissions, by matching them with average speed to calculate operating modes, such as braking, idling, and cruising. While existing emission estimation models, such as the Motor Vehicle Emission Simulator (MOVES), are powerful tools, their reliance on predefined driving cycles can be limiting, as these cycles often do not accurately represent regional driving conditions, making the models less effective for city-wide analyses. To solve this problem, this paper proposes a modular neural network (NN)-based framework to estimate operating mode distributions, bypassing the driving-cycle development phase and utilizing macroscopic variables such as speed, flow, and link infrastructure attributes. The proposed method is validated using a well-calibrated microsimulation model of Brookline, MA, United States. The results indicate that the proposed framework outperforms the operating mode distribution calculated by MOVES based on default driving cycles, providing a closer match to the actual operating mode distribution derived from trajectory data. Specifically, the proposed model achieves an average RMSE of 0.04 in predicting operating mode distribution, compared to 0.08 for MOVES. The average error in emission estimation across pollutants is 8.57% for the proposed method, lower than the 32.86% error for MOVES. In particular, for the estimation of CO2, the proposed method has an error of just 4%, compared to 35% for MOVES. The proposed model can be utilized for real-time emissions monitoring by providing rapid and accurate emissions estimates with easily accessible inputs.
Submitted 27 March, 2025; originally announced March 2025.
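The framework maps macroscopic link variables directly to an operating-mode distribution, which suggests a small network with a softmax head trained against observed mode shares. A hedged PyTorch sketch follows; the layer sizes, the 23-bin output (mirroring MOVES running-exhaust operating-mode bins), and the KL loss are guesses, not the paper's architecture.

```python
import torch
import torch.nn as nn

# Macroscopic inputs (mean speed, flow, grade, link-type one-hots, ...) map to
# a distribution over operating-mode bins; KL divergence against observed mode
# shares is a natural training loss. Both sizes here are placeholders.
N_FEATURES, N_MODES = 8, 23

model = nn.Sequential(
    nn.Linear(N_FEATURES, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, N_MODES), nn.LogSoftmax(dim=-1),
)
loss_fn = nn.KLDivLoss(reduction="batchmean")     # expects log-probs vs. target probs
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(32, N_FEATURES)                   # stand-in link observations
target = torch.softmax(torch.randn(32, N_MODES), dim=-1)  # stand-in mode shares
for _ in range(200):
    opt.zero_grad()
    loss = loss_fn(model(x), target)
    loss.backward()
    opt.step()
```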
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21679">arXiv:2503.21679</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21679">pdf</a>, <a href="https://arxiv.org/format/2503.21679">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> JiraiBench: A Bilingual Benchmark for Evaluating Large Language Models&#39; Detection of Human Self-Destructive Behavior Content in Jirai Community </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yunze Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+T">Tingyu He</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L+Z">Lionel Z. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yiming Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xingyu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xiaohang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+I">Irene Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ng%2C+K+C">Ka Chung Ng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21679v2-abstract-short" style="display: inline;"> This paper introduces JiraiBench, the first bilingual benchmark for evaluating large language models&#39; effectiveness in detecting self-destructive content across Chinese and Japanese social media communities. Focusing on the transnational &#34;Jirai&#34; (landmine) online subculture that encompasses multiple forms of self-destructive behaviors including drug overdose, eating disorders, and self-harm, we pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21679v2-abstract-full').style.display = 'inline'; document.getElementById('2503.21679v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21679v2-abstract-full" style="display: none;"> This paper introduces JiraiBench, the first bilingual benchmark for evaluating large language models&#39; effectiveness in detecting self-destructive content across Chinese and Japanese social media communities. Focusing on the transnational &#34;Jirai&#34; (landmine) online subculture that encompasses multiple forms of self-destructive behaviors including drug overdose, eating disorders, and self-harm, we present a comprehensive evaluation framework incorporating both linguistic and cultural dimensions. Our dataset comprises 10,419 Chinese posts and 5,000 Japanese posts with multidimensional annotation along three behavioral categories, achieving substantial inter-annotator agreement. Experimental evaluations across four state-of-the-art models reveal significant performance variations based on instructional language, with Japanese prompts unexpectedly outperforming Chinese prompts when processing Chinese content. This emergent cross-cultural transfer suggests that cultural proximity can sometimes outweigh linguistic similarity in detection tasks. 
Cross-lingual transfer experiments with fine-tuned models further demonstrate the potential for knowledge transfer between these language systems without explicit target language training. These findings highlight the need for culturally-informed approaches to multilingual content moderation and provide empirical evidence for the importance of cultural context in developing more effective detection systems for vulnerable online communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21679v2-abstract-full').style.display = 'none'; document.getElementById('2503.21679v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 1 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21571">arXiv:2503.21571</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21571">pdf</a>, <a href="https://arxiv.org/format/2503.21571">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Magnitude-Phase Dual-Path Speech Enhancement Network based on Self-Supervised Embedding and Perceptual Contrast Stretch Boosting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mattursun%2C+A">Alimjan Mattursun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liejun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yinfeng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chunyang Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21571v1-abstract-short" style="display: inline;"> Speech self-supervised learning (SSL) has made great progress in various speech processing tasks, but there is still room for improvement in speech enhancement (SE). This paper presents BSP-MPNet, a dual-path framework that combines self-supervised features with magnitude-phase information for SE. The approach starts by applying the perceptual contrast stretching (PCS) algorithm to enhance the mag&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21571v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21571v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21571v1-abstract-full" style="display: none;"> Speech self-supervised learning (SSL) has made great progress in various speech processing tasks, but there is still room for improvement in speech enhancement (SE). 
This paper presents BSP-MPNet, a dual-path framework that combines self-supervised features with magnitude-phase information for SE. The approach starts by applying the perceptual contrast stretching (PCS) algorithm to enhance the magnitude-phase spectrum. A magnitude-phase 2D coarse (MP-2DC) encoder then extracts coarse features from the enhanced spectrum. Next, a feature-separating self-supervised learning (FS-SSL) model generates self-supervised embeddings for the magnitude and phase components separately. These embeddings are fused to create cross-domain feature representations. Finally, two parallel RNN-enhanced multi-attention (REMA) mask decoders refine the features, apply them to the mask, and reconstruct the speech signal. We evaluate BSP-MPNet on the VoiceBank+DEMAND and WHAMR! datasets. Experimental results show that BSP-MPNet outperforms existing methods under various noise conditions, providing new directions for self-supervised speech enhancement research. The implementation of the BSP-MPNet code is available online\footnote[2]{https://github.com/AlimMat/BSP-MPNet. \label{s1}} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21571v1-abstract-full').style.display = 'none'; document.getElementById('2503.21571v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Main paper (6 pages). Accepted for publication by ICME 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20782">arXiv:2503.20782</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20782">pdf</a>, <a href="https://arxiv.org/format/2503.20782">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Audio-Visual Editing via Cross-Modal Delta Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yan-Bo Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianfeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chung-Ching Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bertasius%2C+G">Gedas Bertasius</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lijuan Wang</a> </p> <p class="abstract 
mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20782v1-abstract-short" style="display: inline;"> In this paper, we introduce zero-shot audio-video editing, a novel task that requires transforming original audio-visual content to align with a specified textual prompt without additional model training. To evaluate this task, we curate a benchmark dataset, AvED-Bench, designed explicitly for zero-shot audio-video editing. AvED-Bench includes 110 videos, each with a 10-second duration, spanning 1&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20782v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20782v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20782v1-abstract-full" style="display: none;"> In this paper, we introduce zero-shot audio-video editing, a novel task that requires transforming original audio-visual content to align with a specified textual prompt without additional model training. To evaluate this task, we curate a benchmark dataset, AvED-Bench, designed explicitly for zero-shot audio-video editing. AvED-Bench includes 110 videos, each with a 10-second duration, spanning 11 categories from VGGSound. It offers diverse prompts and scenarios that require precise alignment between auditory and visual elements, enabling robust evaluation. We identify limitations in existing zero-shot audio and video editing methods, particularly in synchronization and coherence between modalities, which often result in inconsistent outcomes. To address these challenges, we propose AvED, a zero-shot cross-modal delta denoising framework that leverages audio-video interactions to achieve synchronized and coherent edits. AvED demonstrates superior results on both AvED-Bench and the recent OAVE dataset to validate its generalization capabilities. Results are available at https://genjib.github.io/project_page/AVED/index.html <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20782v1-abstract-full').style.display = 'none'; document.getElementById('2503.20782v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://genjib.github.io/project_page/AVED/index.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20521">arXiv:2503.20521</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20521">pdf</a>, <a href="https://arxiv.org/format/2503.20521">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Decremental Dynamics Planning for Robot Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yuanjie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linji Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hawes%2C+N">Nick Hawes</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xuesu Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20521v1-abstract-short" style="display: inline;"> Most, if not all, robot navigation systems employ a decomposed planning framework that includes global and local planning. To trade-off onboard computation and plan quality, current systems have to limit all robot dynamics considerations only within the local planner, while leveraging an extremely simplified robot representation (e.g., a point-mass holonomic model without dynamics) in the global l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20521v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20521v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20521v1-abstract-full" style="display: none;"> Most, if not all, robot navigation systems employ a decomposed planning framework that includes global and local planning. To trade-off onboard computation and plan quality, current systems have to limit all robot dynamics considerations only within the local planner, while leveraging an extremely simplified robot representation (e.g., a point-mass holonomic model without dynamics) in the global level. However, such an artificial decomposition based on either full or zero consideration of robot dynamics can lead to gaps between the two levels, e.g., a global path based on a holonomic point-mass model may not be realizable by a non-holonomic robot, especially in highly constrained obstacle environments. Motivated by such a limitation, we propose a novel paradigm, Decremental Dynamics Planning that integrates dynamic constraints into the entire planning process, with a focus on high-fidelity dynamics modeling at the beginning and a gradual fidelity reduction as the planning progresses. To validate the effectiveness of this paradigm, we augment three different planners with DDP and show overall improved planning performance. We also develop a new DDP-based navigation system, which achieves first place in the simulation phase of the 2025 BARN Challenge. Both simulated and physical experiments validate DDP&#39;s hypothesized benefits. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20521v1-abstract-full').style.display = 'none'; document.getElementById('2503.20521v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. 2025 International Conference on Intelligent Robots and Systems (IROS 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20430">arXiv:2503.20430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20430">pdf</a>, <a href="https://arxiv.org/format/2503.20430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RALLRec+: Retrieval Augmented Large Language Model Recommendation with Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Sichun Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaojie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linrong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Sicong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+H">Hanxu Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Linqi Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20430v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have been integrated into recommender systems to enhance user behavior comprehension. The Retrieval Augmented Generation (RAG) technique is further incorporated into these systems to retrieve more relevant items and improve system performance. However, existing RAG methods have two shortcomings. \textit{(i)} In the \textit{retrieval} stage, they rely primarily on textu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20430v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20430v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20430v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have been integrated into recommender systems to enhance user behavior comprehension. The Retrieval Augmented Generation (RAG) technique is further incorporated into these systems to retrieve more relevant items and improve system performance. However, existing RAG methods have two shortcomings. \textit{(i)} In the \textit{retrieval} stage, they rely primarily on textual semantics and often fail to incorporate the most relevant items, thus constraining system effectiveness. \textit{(ii)} In the \textit{generation} stage, they lack explicit chain-of-thought reasoning, further limiting their potential. 
arXiv:2503.20368 [pdf, other] (cs.CV)
Pluggable Style Representation Learning for Multi-Style Transfer
Authors: Hongda Liu, Longguang Wang, Weijun Guan, Ye Zhang, Yulan Guo
Abstract: Due to the high diversity of image styles, scalability to various styles plays a critical role in real-world applications. To accommodate a large number of styles, previous multi-style transfer approaches rely on enlarging the model size, while arbitrary-style transfer methods utilize heavy backbones. However, the additional computational cost introduced by more model parameters prevents these methods from being deployed on resource-limited devices. To address this challenge, in this paper we develop a style transfer framework by decoupling style modeling from style transfer. Specifically, for style modeling, we propose a style representation learning scheme to encode the style information into a compact representation. Then, for style transfer, we develop a style-aware multi-style transfer network (SaMST) to adapt to diverse styles using pluggable style representations. In this way, our framework is able to accommodate diverse image styles in the learned style representations without introducing additional overhead during inference, thereby maintaining efficiency. Experiments show that our style representation can extract accurate style information. Moreover, qualitative and quantitative results demonstrate that our method achieves state-of-the-art performance in terms of both accuracy and efficiency. The code is available at https://github.com/The-Learning-And-Vision-Atelier-LAVA/SaMST.
Submitted 26 March, 2025; originally announced March 2025.
Comments: 18 pages, 13 figures, 2 tables
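A minimal sketch of a pluggable style representation, assuming a FiLM-style modulation of shared content features: each style is one small embedding row, so adding a style adds no new network weights. Dimensions and the modulation form are assumptions, not SaMST's actual design.

import torch
import torch.nn as nn

class StyleBank(nn.Module):
    # Hedged sketch: compact per-style codes plugged into a shared
    # transfer network via learned scale/shift modulation.
    def __init__(self, n_styles=16, dim=64, channels=32):
        super().__init__()
        self.codes = nn.Embedding(n_styles, dim)
        self.to_scale = nn.Linear(dim, channels)
        self.to_shift = nn.Linear(dim, channels)

    def forward(self, feat, style_id):
        # feat: (B, C, H, W) content features from a shared encoder.
        code = self.codes(style_id)                   # (B, dim)
        scale = self.to_scale(code)[..., None, None]  # (B, C, 1, 1)
        shift = self.to_shift(code)[..., None, None]
        return feat * (1 + scale) + shift

bank = StyleBank()
feat = torch.randn(2, 32, 8, 8)
out = bank(feat, torch.tensor([0, 5]))  # two images, two plugged styles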
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 13 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20331">arXiv:2503.20331</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20331">pdf</a>, <a href="https://arxiv.org/format/2503.20331">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3594739.3610706">10.1145/3594739.3610706 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> WiCross: Indoor Human Zone-Crossing Detection Using Commodity WiFi Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weiyan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuanzhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+K">Kai Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Leye Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Daqing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20331v1-abstract-short" style="display: inline;"> Detecting whether a target crosses the given zone (e.g., a door) can enable various practical applications in smart homes, including intelligent security and people counting. The traditional infrared-based approach only covers a line and can be easily cracked. In contrast, reusing the ubiquitous WiFi devices deployed in homes has the potential to cover a larger area of interest as WiFi signals are&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20331v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20331v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20331v1-abstract-full" style="display: none;"> Detecting whether a target crosses the given zone (e.g., a door) can enable various practical applications in smart homes, including intelligent security and people counting. The traditional infrared-based approach only covers a line and can be easily cracked. In contrast, reusing the ubiquitous WiFi devices deployed in homes has the potential to cover a larger area of interest as WiFi signals are scattered throughout the entire space. By detecting the walking direction (i.e., approaching and moving away) with WiFi signal strength change, existing work can identify the behavior of crossing between WiFi transceiver pair. However, this method mistakenly classifies the turn-back behavior as crossing behavior, resulting in a high false alarm rate. In this paper, we propose WiCross, which can accurately distinguish the turn-back behavior with the phase statistics pattern of WiFi signals and thus robustly identify whether the target crosses the area between the WiFi transceiver pair. 
arXiv:2503.20198 [pdf, other] (cs.CV)
Beyond Words: Advancing Long-Text Image Generation via Multimodal Autoregressive Models
Authors: Alex Jinpeng Wang, Linjie Li, Zhengyuan Yang, Lijuan Wang, Min Li
Abstract: Recent advancements in autoregressive and diffusion models have led to strong performance in image generation with short scene-text words. However, generating coherent, long-form text in images, such as paragraphs in slides or documents, remains a major challenge for current generative models. We present the first work specifically focused on long-text image generation, addressing a critical gap in existing text-to-image systems that typically handle only brief phrases or single sentences. Through comprehensive analysis of state-of-the-art autoregressive generation models, we identify the image tokenizer as a critical bottleneck in text generation quality. To address this, we introduce a novel text-focused binary tokenizer optimized for capturing detailed scene-text features. Leveraging our tokenizer, we develop \ModelName, a multimodal autoregressive model that excels at generating high-quality long-text images with unprecedented fidelity. Our model offers robust controllability, enabling customization of text properties such as font style, size, color, and alignment. Extensive experiments demonstrate that \ModelName significantly outperforms SD3.5 Large [sd3] and GPT-4o [gpt4o] with DALL-E 3 [dalle3] in generating long text accurately, consistently, and flexibly. Beyond its technical achievements, \ModelName opens up exciting opportunities for innovative applications such as interleaved document and PowerPoint generation, establishing a new frontier in long-text image generation.
Submitted 25 March, 2025; originally announced March 2025.
Comments: 16 pages
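A generic sketch of a binary latent quantizer with a straight-through estimator, which is one standard way to realize the binary tokenizer the abstract mentions; the paper's specific tokenizer design is not described in this abstract, so everything below is an assumption.

import torch

class BinaryQuantizer(torch.nn.Module):
    # Hedged sketch: binarize encoder features to {-1, +1} while letting
    # gradients pass through unchanged (straight-through estimator).
    def forward(self, z):
        z_bin = torch.sign(z)
        # Map the rare exact zero to +1 so codes stay binary.
        z_bin = torch.where(z_bin == 0, torch.ones_like(z_bin), z_bin)
        # Forward uses z_bin; backward treats the op as identity.
        return z + (z_bin - z).detach()

q = BinaryQuantizer()
codes = q(torch.randn(1, 16, 8, 8, requires_grad=True))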
arXiv:2503.19897 [pdf, other] (cs.CV)
Scaling Down Text Encoders of Text-to-Image Diffusion Models
Authors: Lifu Wang, Daqing Liu, Xinchen Liu, Xiaodong He
Abstract: Text encoders in diffusion models have rapidly evolved, transitioning from CLIP to T5-XXL. Although this evolution has significantly enhanced the models' ability to understand complex prompts and generate text, it also leads to a substantial increase in the number of parameters. Although the T5 series of encoders is trained on the C4 natural language corpus, which includes a significant amount of non-visual data, diffusion models with a T5 encoder do not respond to those non-visual prompts, indicating redundancy in representational power. This raises an important question: "Do we really need such a large text encoder?" In pursuit of an answer, we employ vision-based knowledge distillation to train a series of T5 encoder models. To fully inherit the teacher's capabilities, we constructed our dataset based on three criteria: image quality, semantic understanding, and text rendering. Our results demonstrate a clear scaling-down pattern: the distilled T5-base model can generate images of comparable quality to those produced by T5-XXL while being 50 times smaller. This reduction in model size significantly lowers the GPU requirements for running state-of-the-art models such as FLUX and SD3, making high-quality text-to-image generation more accessible.
Submitted 25 March, 2025; originally announced March 2025.
Comments: accepted by CVPR 2025
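A hedged sketch of the distillation objective, reduced to feature matching between student and teacher text encoders; the "vision-based" part (supervising on generated images) is omitted, and all names and shapes are illustrative rather than the paper's.

import torch
import torch.nn.functional as F

def distill_loss(student_emb, teacher_emb, proj):
    # Hedged sketch: project the student's sequence embeddings to the
    # teacher's width and match them with an MSE loss.
    return F.mse_loss(proj(student_emb), teacher_emb)

# Illustrative widths: 768 for a T5-base student, 4096 for a T5-XXL teacher.
proj = torch.nn.Linear(768, 4096)
s = torch.randn(2, 77, 768)   # student token embeddings
t = torch.randn(2, 77, 4096)  # teacher token embeddings
loss = distill_loss(s, t, proj)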
arXiv:2503.19708 [pdf, other] (physics.flu-dyn, cs.LG)
Data-efficient rapid prediction of urban airflow and temperature fields for complex building geometries
Authors: Shaoxiang Qin, Dongxue Zhan, Ahmed Marey, Dingyang Geng, Theodore Potsis, Liangzhu Leon Wang
Abstract: Accurately predicting urban microclimate, including wind speed and temperature, based solely on building geometry requires capturing complex interactions between buildings and airflow, particularly long-range wake effects influenced by directional geometry. Traditional methods relying on computational fluid dynamics (CFD) are prohibitively expensive for large-scale simulations, while data-driven approaches struggle with limited training data and the need to model both local and far-field dependencies. In response, we propose a novel framework that leverages a multi-directional distance feature (MDDF) combined with localized training to achieve effective wind-field predictions with minimal CFD data. By reducing the problem's dimensionality, localized training effectively increases the number of training samples, while the MDDF encodes the surrounding geometric information to accurately model wake dynamics and flow redirection. Trained on only 24 CFD simulations, our localized Fourier neural operator (Local-FNO) model generates full 3D wind velocity and temperature predictions in under one minute, yielding a 500-fold speedup over conventional CFD methods. With mean absolute errors of 0.3 m/s for wind speed and 0.3 °C for temperature on unseen urban configurations, our method demonstrates strong generalization capabilities and significant potential for practical urban applications.
Submitted 25 March, 2025; originally announced March 2025.
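The MDDF can be pictured as ray-marching the distance to the nearest building along several directions from each query cell. A toy 2-D version follows; the direction count, range, and grid encoding are invented for illustration.

import numpy as np

def mddf(occupancy, cell, n_dirs=8, max_steps=100):
    # Hedged sketch of a multi-directional distance feature: march along
    # n_dirs horizontal directions and record the distance (in cells) to
    # the nearest occupied/building cell, capped at max_steps.
    h, w = occupancy.shape
    feats = []
    for k in range(n_dirs):
        ang = 2 * np.pi * k / n_dirs
        di, dj = np.cos(ang), np.sin(ang)
        dist = max_steps  # default: no building within range
        for s in range(1, max_steps):
            i = int(round(cell[0] + s * di))
            j = int(round(cell[1] + s * dj))
            if not (0 <= i < h and 0 <= j < w) or occupancy[i, j]:
                dist = s
                break
        feats.append(dist)
    return np.array(feats, dtype=float)

grid = np.zeros((64, 64), dtype=bool)
grid[30:40, 30:40] = True  # one building block
print(mddf(grid, (10, 10)))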
arXiv:2503.19407 [pdf, other] (cs.CV)
A Prototype-Guided Coarse Annotations Refining Approach for Whole Slide Images
Authors: Bingjian Yao, Weiping Lin, Yan He, Zheng Wang, Liangsheng Wang
Abstract: Fine-grained annotations in whole slide images (WSIs) delineate the boundaries of various pathological regions. However, generating such detailed annotations is often costly, whereas coarse annotations are relatively simpler to produce. Existing methods for refining coarse annotations often rely on extensive training samples or clean datasets, and fail to capture both intra-slide and inter-slide latent semantic patterns, limiting their precision. In this paper, we propose a prototype-guided approach. Specifically, we introduce a local-to-global approach to construct non-redundant representative prototypes by jointly modeling intra-slide local semantics and inter-slide contextual relationships. Then a prototype-guided pseudo-labeling module is proposed for refining coarse annotations. Finally, we employ a dynamic data sampling and re-finetuning strategy to train a patch classifier. Extensive experiments on three publicly available WSI datasets, covering lymph, liver, and colorectal cancers, demonstrate that our method significantly outperforms existing state-of-the-art (SOTA) methods. The code will be available.
Submitted 25 March, 2025; originally announced March 2025.
Comments: 10 pages
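A minimal sketch of prototype-guided pseudo-labeling as described above: each patch adopts the class of its most similar prototype when the cosine similarity clears a confidence threshold. The threshold and feature dimension are arbitrary placeholders.

import numpy as np

def pseudo_label(patch_feats, prototypes, tau=0.8):
    # Hedged sketch: cosine similarity between L2-normalized patch
    # features (N, D) and class prototypes (K, D); low-confidence
    # patches are marked -1 and left unrefined.
    f = patch_feats / np.linalg.norm(patch_feats, axis=1, keepdims=True)
    p = prototypes / np.linalg.norm(prototypes, axis=1, keepdims=True)
    sim = f @ p.T                       # (N, K)
    labels = sim.argmax(axis=1)
    labels[sim.max(axis=1) < tau] = -1  # -1 marks "leave unrefined"
    return labels

feats = np.random.randn(5, 128)
protos = np.random.randn(3, 128)
print(pseudo_label(feats, protos))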
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19407v1-abstract-full').style.display = 'none'; document.getElementById('2503.19407v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19312">arXiv:2503.19312</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19312">pdf</a>, <a href="https://arxiv.org/format/2503.19312">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ImageGen-CoT: Enhancing Text-to-Image In-context Learning with Chain-of-Thought Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liao%2C+J">Jiaqi Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dianqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yu Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lijuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19312v1-abstract-short" style="display: inline;"> In this work, we study the problem of Text-to-Image In-Context Learning (T2I-ICL). While Unified Multimodal LLMs (MLLMs) have advanced rapidly in recent years, they struggle with contextual reasoning in T2I-ICL scenarios. To address this limitation, we propose a novel framework that incorporates a thought process called ImageGen-CoT prior to image generation. To avoid generating unstructured ineff&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19312v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19312v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19312v1-abstract-full" style="display: none;"> In this work, we study the problem of Text-to-Image In-Context Learning (T2I-ICL). While Unified Multimodal LLMs (MLLMs) have advanced rapidly in recent years, they struggle with contextual reasoning in T2I-ICL scenarios. To address this limitation, we propose a novel framework that incorporates a thought process called ImageGen-CoT prior to image generation. To avoid generating unstructured ineffective reasoning steps, we develop an automatic pipeline to curate a high-quality ImageGen-CoT dataset. We then fine-tune MLLMs using this dataset to enhance their contextual reasoning capabilities. To further enhance performance, we explore test-time scale-up strategies and propose a novel hybrid scaling approach. 
arXiv:2503.19012 [pdf, other] (cs.CV)
DiffV2IR: Visible-to-Infrared Diffusion Model via Vision-Language Understanding
Authors: Lingyan Ran, Lidong Wang, Guangcong Wang, Peng Wang, Yanning Zhang
Abstract: The task of translating visible-to-infrared images (V2IR) is inherently challenging due to three main obstacles: 1) achieving semantic-aware translation, 2) managing the diverse wavelength spectrum in infrared imagery, and 3) the scarcity of comprehensive infrared datasets. Current leading methods tend to treat V2IR as a conventional image-to-image synthesis challenge, often overlooking these specific issues. To address this, we introduce DiffV2IR, a novel framework for image translation comprising two key elements: a Progressive Learning Module (PLM) and a Vision-Language Understanding Module (VLUM). PLM features an adaptive diffusion model architecture that leverages multi-stage knowledge learning for the infrared transition from full-range to target wavelengths. To improve V2IR translation, VLUM incorporates unified vision-language understanding. We also collected a large infrared dataset, IR-500K, which includes 500,000 infrared images covering various scenes and objects under various environmental conditions. Through the combination of PLM, VLUM, and the extensive IR-500K dataset, DiffV2IR markedly improves the performance of V2IR. Experiments validate DiffV2IR's excellence in producing high-quality translations, establishing its efficacy and broad applicability. The code, dataset, and DiffV2IR model will be available at https://github.com/LidongWang-26/DiffV2IR.
Submitted 24 March, 2025; originally announced March 2025.
Comments: Project page: https://diffv2ir.github.io/
arXiv:2503.18458 [pdf, other] (cs.CV, cs.CL)
StableGS: A Floater-Free Framework for 3D Gaussian Splatting
Authors: Luchao Wang, Qian Ren, Kaimin Liao, Hua Wang, Zhi Chen, Yaohua Tang
Abstract: Recent years have witnessed the remarkable success of 3D Gaussian Splatting (3DGS) in novel view synthesis, surpassing prior differentiable rendering methods in both quality and efficiency. However, its training process suffers from coupled opacity-color optimization that frequently converges to local minima, producing floater artifacts that degrade visual fidelity. We present StableGS, a framework that eliminates floaters through cross-view depth consistency constraints while introducing a dual-opacity GS model to decouple the geometry and material properties of translucent objects. To further enhance reconstruction quality in weakly textured regions, we integrate DUSt3R depth estimation, significantly improving geometric stability. Our method fundamentally addresses 3DGS training instabilities, outperforming existing state-of-the-art methods across open-source datasets.
Submitted 24 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.
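The cross-view depth consistency constraint might be sketched as a robust penalty between one view's rendered depth and another view's depth warped into it; the warping (from known camera poses) is assumed to happen elsewhere, and the robust scale is illustrative.

import torch
import torch.nn.functional as F

def depth_consistency_loss(depth_a, depth_b_warped, valid):
    # Hedged sketch: floaters tend to violate agreement between a view's
    # rendered depth and a neighboring view's reprojected depth, so a
    # robust penalty on the valid overlap suppresses them.
    return F.huber_loss(depth_a[valid], depth_b_warped[valid], delta=0.1)

da = torch.rand(64, 64)          # depth rendered in view A
db = torch.rand(64, 64)          # view B depth warped into view A
mask = db > 0.05                 # valid reprojection mask (illustrative)
loss = depth_consistency_loss(da, db, mask)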
arXiv:2503.18160 [pdf, other] (cs.CV, cs.MM)
MAO: Efficient Model-Agnostic Optimization of Prompt Tuning for Vision-Language Models
Authors: Haoyang Li, Siyu Zhou, Liang Wang, Guodong Long
Abstract: Though CLIP-based prompt tuning significantly enhances pre-trained vision-language models, existing research focuses on reconstructing the model architecture, e.g., with additional loss calculations and meta-networks. These approaches generally lead to increased complexity and extended training cost. To maintain the efficiency of the tuning process, we propose plug-and-play Model-Agnostic Optimization (MAO) for prompt tuning. Without altering any components of the prompt tuning backbone, we introduce a Data-Driven Enhancement framework to optimize the distribution of the initial data, and incorporate an Alterable Regularization module to boost the task-specific feature processing pipeline, thereby improving overall performance while maintaining low computational cost. Extensive experiments on MAO demonstrate its outstanding performance and efficiency. The code of MAO is available at https://github.com/JREion/M.A.O
Submitted 23 March, 2025; originally announced March 2025.
Comments: Accepted by the IEEE International Conference on Multimedia & Expo 2025 (ICME 2025); 12 pages, 6 figures, 8 tables
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the IEEE International Conference on Multimedia &amp; Expo 2025 (ICME 2025); 12 pages, 6 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17992">arXiv:2503.17992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17992">pdf</a>, <a href="https://arxiv.org/format/2503.17992">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Geometric Constrained Non-Line-of-Sight Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xueying Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lianfang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Y">Yuping Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17992v1-abstract-short" style="display: inline;"> Normal reconstruction is crucial in non-line-of-sight (NLOS) imaging, as it provides key geometric and lighting information about hidden objects, which significantly improves reconstruction accuracy and scene understanding. However, jointly estimating normals and albedo expands the problem from matrix-valued functions to tensor-valued functions that substantially increasing complexity and computat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17992v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17992v1-abstract-full" style="display: none;"> Normal reconstruction is crucial in non-line-of-sight (NLOS) imaging, as it provides key geometric and lighting information about hidden objects, which significantly improves reconstruction accuracy and scene understanding. However, jointly estimating normals and albedo expands the problem from matrix-valued functions to tensor-valued functions that substantially increasing complexity and computational difficulty. In this paper, we propose a novel joint albedo-surface reconstruction method, which utilizes the Frobenius norm of the shape operator to control the variation rate of the normal field. It is the first attempt to apply regularization methods to the reconstruction of surface normals for hidden objects. By improving the accuracy of the normal field, it enhances detail representation and achieves high-precision reconstruction of hidden object geometry. The proposed method demonstrates robustness and effectiveness on both synthetic and experimental datasets. 
arXiv:2503.17987 [pdf, other] (cs.CR, cs.AI, cs.CV)
Metaphor-based Jailbreaking Attacks on Text-to-Image Models
Authors: Chenyu Zhang, Yiwen Ma, Lanjun Wang, Wenhui Li, Yi Tu, An-An Liu
Abstract: To mitigate misuse, text-to-image (T2I) models commonly incorporate safety filters to prevent the generation of sensitive images. Unfortunately, recent jailbreaking attack methods use LLMs to generate adversarial prompts that effectively bypass safety filters while generating sensitive images, revealing safety vulnerabilities within T2I models. However, existing LLM-based attack methods lack explicit guidance, relying on substantial queries to achieve a successful attack, which limits their practicality in real-world scenarios. In this work, we introduce MJA, a metaphor-based jailbreaking attack method inspired by the Taboo game, aiming to balance attack effectiveness and query efficiency by generating metaphor-based adversarial prompts. Specifically, MJA consists of two modules: an LLM-based multi-agent generation module (MLAG) and an adversarial prompt optimization module (APO). MLAG decomposes the generation of metaphor-based adversarial prompts into three subtasks: metaphor retrieval, context matching, and adversarial prompt generation. Subsequently, MLAG coordinates three LLM-based agents to generate diverse adversarial prompts by exploring various metaphors and contexts. To enhance attack efficiency, APO first trains a surrogate model to predict the attack results of adversarial prompts and then designs an acquisition strategy to adaptively identify optimal adversarial prompts. Experiments demonstrate that MJA achieves better attack effectiveness while requiring fewer queries compared to baseline methods. Moreover, our adversarial prompts exhibit strong transferability across various open-source and commercial T2I models. Warning: this paper includes model-generated content that may contain offensive or distressing material.
Submitted 23 March, 2025; originally announced March 2025.
Comments: 13 pages, 4 figures. This paper includes model-generated content that may contain offensive or distressing material.
arXiv:2503.17900 [pdf, other] (cs.CL)
MedPlan: A Two-Stage RAG-Based System for Personalized Medical Plan Generation
Authors: Hsin-Ling Hsu, Cong-Tinh Dao, Luning Wang, Zitao Shuai, Thao Nguyen Minh Phan, Jun-En Ding, Chun-Chieh Liao, Pengfei Hu, Xiaoxue Han, Chih-Ho Hsu, Dongsheng Luo, Wen-Chih Peng, Feng Liu, Fang-Ming Hung, Chenwei Wu
Abstract: Despite recent success in applying large language models (LLMs) to electronic health records (EHR), most systems focus primarily on assessment rather than treatment planning. We identify three critical limitations in current approaches: they generate treatment plans in a single pass rather than following the sequential reasoning process used by clinicians; they rarely incorporate patient-specific historical context; and they fail to effectively distinguish between subjective and objective clinical information. Motivated by the SOAP methodology (Subjective, Objective, Assessment, Plan), we introduce MedPlan, a novel framework that structures LLM reasoning to align with real-life clinician workflows. Our approach employs a two-stage architecture that first generates a clinical assessment based on patient symptoms and objective data, then formulates a structured treatment plan informed by this assessment and enriched with patient-specific information through retrieval-augmented generation. Comprehensive evaluation demonstrates that our method significantly outperforms baseline approaches in both assessment accuracy and treatment plan quality.
Submitted 22 March, 2025; originally announced March 2025.
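The two-stage, SOAP-ordered pipeline can be sketched directly: assess first from subjective and objective inputs, then plan with retrieved patient history. Here `llm` and `retrieve` are placeholder callables, not MedPlan's actual interfaces.

def medplan_style_pipeline(subjective, objective, patient_id, llm, retrieve):
    # Hedged sketch of a two-stage RAG flow in SOAP order.
    # Stage 1: assessment from the S and O sections.
    assessment = llm(
        f"Subjective: {subjective}\nObjective: {objective}\n"
        "Write a clinical assessment.")
    # Stage 2: plan conditioned on the assessment plus retrieved,
    # patient-specific history (retrieval-augmented generation).
    history = retrieve(patient_id, query=assessment)
    plan = llm(
        f"Assessment: {assessment}\nRelevant history: {history}\n"
        "Write a structured treatment plan.")
    return assessment, plan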
Motivated by the SOAP methodology (Subjective, Objective, Assessment, Plan), we introduce MedPlan, a novel framework that structures LLM reasoning to align with real-life clinician workflows. Our approach employs a two-stage architecture that first generates a clinical assessment based on patient symptoms and objective data, then formulates a structured treatment plan informed by this assessment and enriched with patient-specific information through retrieval-augmented generation. Comprehensive evaluation demonstrates that our method significantly outperforms baseline approaches in both assessment accuracy and treatment plan quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17900v1-abstract-full').style.display = 'none'; document.getElementById('2503.17900v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17551">arXiv:2503.17551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17551">pdf</a>, <a href="https://arxiv.org/format/2503.17551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Ruixiao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chunhui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+F">Fangming Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Ze Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+Z">Zhuolin Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Hongyu Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17551v1-abstract-short" style="display: inline;"> Transformer-based multimodal models are widely used in industrial-scale recommendation, search, and advertising systems for content understanding and relevance ranking. Enhancing labeled training data quality and cross-modal fusion significantly improves model performance, influencing key metrics such as quality view rates and ad revenue. 
High-quality annotations are crucial for advancing content&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17551v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17551v1-abstract-full" style="display: none;"> Transformer-based multimodal models are widely used in industrial-scale recommendation, search, and advertising systems for content understanding and relevance ranking. Enhancing labeled training data quality and cross-modal fusion significantly improves model performance, influencing key metrics such as quality view rates and ad revenue. High-quality annotations are crucial for advancing content modeling, yet traditional statistical-based active learning (AL) methods face limitations: they struggle to detect overconfident misclassifications and are less effective in distinguishing semantically similar items in deep neural networks. Additionally, audio information plays an increasingly important role, especially on short-video platforms, yet most pre-trained multimodal architectures primarily focus on text and images. While training from scratch across all three modalities is possible, it sacrifices the benefits of leveraging existing pre-trained visual-language (VL) and audio models. To address these challenges, we propose kNN-based Latent Space Broadening (LSB) to enhance AL efficiency and Vision-Language Modeling with Audio Enhancement (VLMAE), a mid-fusion approach integrating audio into VL models. This system was deployed in production systems, leading to significant business gains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17551v1-abstract-full').style.display = 'none'; document.getElementById('2503.17551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
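<p class="is-size-7">To make the kNN-based Latent Space Broadening idea concrete, here is a minimal sketch of one plausible reading: seed an active-learning batch with the most uncertain samples, then broaden it with their nearest neighbours in embedding space, so that semantically similar items a pure uncertainty ranking would miss are also sent for labeling. The function name, the heuristic, and all parameters below are our assumptions for illustration, not the authors' implementation.</p>
<pre><code>import numpy as np

def broaden_selection(embeddings, uncertainty, seed_size=10, k=5):
    """Hypothetical sketch of kNN-based latent space broadening:
    take the most uncertain samples as seeds, then add each seed's
    k nearest neighbours in embedding space to the labeling batch."""
    seed = np.argsort(-uncertainty)[:seed_size]          # most uncertain first
    # Euclidean distances from every seed to all samples.
    d = np.linalg.norm(embeddings[seed][:, None, :] - embeddings[None, :, :], axis=-1)
    d[np.arange(seed_size), seed] = np.inf               # exclude the seed itself
    neighbours = np.argsort(d, axis=1)[:, :k].ravel()
    return np.unique(np.concatenate([seed, neighbours]))

rng = np.random.default_rng(0)
emb = rng.normal(size=(200, 32))   # toy embeddings
unc = rng.uniform(size=200)        # toy uncertainty scores
print(broaden_selection(emb, unc).shape)
</code></pre>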
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17211">arXiv:2503.17211</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17211">pdf</a>, <a href="https://arxiv.org/format/2503.17211">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Language Anchor-Guided Method for Robust Noisy Domain Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Z">Zilin Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lehong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+F">Fangzhou Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhigang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yamada%2C+K+D">Kazunori D Yamada</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+W">Wang Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17211v1-abstract-short" style="display: inline;"> Real-world machine learning applications often struggle with two major challenges: distribution shift and label noise. Models tend to overfit by focusing on redundant and uninformative features in the training data, which makes it hard for them to generalize to the target domain. Noisy data worsens this problem by causing further overfitting to the noise, meaning that existing methods often fail t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17211v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17211v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17211v1-abstract-full" style="display: none;"> Real-world machine learning applications often struggle with two major challenges: distribution shift and label noise. Models tend to overfit by focusing on redundant and uninformative features in the training data, which makes it hard for them to generalize to the target domain. Noisy data worsens this problem by causing further overfitting to the noise, meaning that existing methods often fail to tell the difference between true, invariant features and misleading, spurious ones. To tackle these issues, we introduce Anchor Alignment and Adaptive Weighting (A3W). This new algorithm uses sample reweighting guided by natural language processing (NLP) anchors to extract more representative features. In simple terms, A3W leverages semantic representations from natural language models as a source of domain-invariant prior knowledge. Additionally, it employs a weighted loss function that adjusts each sample&#39;s contribution based on its similarity to the corresponding NLP anchor. This adjustment makes the model more robust to noisy labels. 
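<p class="is-size-7">The anchor-guided reweighting described above lends itself to a compact sketch. Below is a toy numpy version of one plausible reading of A3W's weighted loss, in which each sample's cross-entropy term is scaled by the cosine similarity between its feature and the language anchor of its label; the abstract gives no formulas, so the exact weighting rule and every name here are our assumptions.</p>
<pre><code>import numpy as np

def anchor_weighted_loss(features, logits, labels, anchors):
    """Toy anchor-weighted cross-entropy (our reading of the A3W idea):
    samples whose features align with their class's language anchor get
    larger weight, so likely-noisy samples are down-weighted."""
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    a = anchors / np.linalg.norm(anchors, axis=1, keepdims=True)
    sim = np.sum(f * a[labels], axis=1)       # cosine similarity, in [-1, 1]
    w = np.clip(sim, 0.0, None)
    w = w / (w.sum() + 1e-8)                  # normalized sample weights
    # Standard per-sample softmax cross-entropy.
    z = logits - logits.max(axis=1, keepdims=True)
    logp = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    ce = -logp[np.arange(len(labels)), labels]
    return np.sum(w * ce)

rng = np.random.default_rng(0)
x = rng.normal(size=(8, 16))            # toy features
logits = rng.normal(size=(8, 3))        # toy class logits
y = rng.integers(0, 3, size=8)          # toy (possibly noisy) labels
anchors = rng.normal(size=(3, 16))      # toy NLP anchor embeddings
print(anchor_weighted_loss(x, logits, y, anchors))
</code></pre>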
Extensive experiments on standard benchmark datasets show that A3W consistently outperforms state-of-the-art domain generalization methods, offering significant improvements in both accuracy and robustness across different datasets and noise levels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17211v1-abstract-full').style.display = 'none'; document.getElementById('2503.17211v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16835">arXiv:2503.16835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16835">pdf</a>, <a href="https://arxiv.org/format/2503.16835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Safe and Reliable Diffusion Models via Subspace Projection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huiqiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+T">Tianqing Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linlin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+L">Longxiang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wanlei Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16835v1-abstract-short" style="display: inline;"> Large-scale text-to-image (T2I) diffusion models have revolutionized image generation, enabling the synthesis of highly detailed visuals from textual descriptions. However, these models may inadvertently generate inappropriate content, such as copyrighted works or offensive images. While existing methods attempt to eliminate specific unwanted concepts, they often fail to ensure complete removal, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16835v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16835v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16835v1-abstract-full" style="display: none;"> Large-scale text-to-image (T2I) diffusion models have revolutionized image generation, enabling the synthesis of highly detailed visuals from textual descriptions. However, these models may inadvertently generate inappropriate content, such as copyrighted works or offensive images. While existing methods attempt to eliminate specific unwanted concepts, they often fail to ensure complete removal, allowing the concept to reappear in subtle forms. 
For instance, a model may successfully avoid generating images in Van Gogh&#39;s style when explicitly prompted with &#39;Van Gogh&#39;, yet still reproduce his signature artwork when given the prompt &#39;Starry Night&#39;. In this paper, we propose SAFER, a novel and efficient approach for thoroughly removing target concepts from diffusion models. At a high level, SAFER is inspired by the observed low-dimensional structure of the text embedding space. The method first identifies a concept-specific subspace $S_c$ associated with the target concept c. It then projects the prompt embeddings onto the complementary subspace of $S_c$, effectively erasing the concept from the generated images. Since concepts can be abstract and difficult to fully capture using natural language alone, we employ textual inversion to learn an optimized embedding of the target concept from a reference image. This enables more precise subspace estimation and enhances removal performance. Furthermore, we introduce a subspace expansion strategy to ensure comprehensive and robust concept erasure. Extensive experiments demonstrate that SAFER consistently and effectively erases unwanted concepts from diffusion models while preserving generation quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16835v1-abstract-full').style.display = 'none'; document.getElementById('2503.16835v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16553">arXiv:2503.16553</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16553">pdf</a>, <a href="https://arxiv.org/format/2503.16553">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Foundational individual Mobility Prediction Model based on Open-Source Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Z">Zhenlin Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Leizhen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pereira%2C+F+C">Francisco Camara Pereira</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zhenlinag Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16553v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are widely applied to domain-specific tasks due to their massive general knowledge and remarkable inference capacities. Current studies on LLMs have shown immense potential in applying LLMs to model individual mobility prediction problems. 
However, most LLM-based mobility prediction models only train on specific datasets or use single well-designed prompts, leading to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16553v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16553v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16553v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are widely applied to domain-specific tasks due to their massive general knowledge and remarkable inference capacities. Current studies on LLMs have shown immense potential in applying LLMs to model individual mobility prediction problems. However, most LLM-based mobility prediction models only train on specific datasets or use single well-designed prompts, leading to difficulty in adapting to different cities and users with diverse contexts. To fill these gaps, this paper proposes a unified fine-tuning framework to train a foundational open source LLM-based mobility prediction model. We conducted extensive experiments on six real-world mobility datasets to validate the proposed model. The results showed that the proposed model achieved the best performance in prediction accuracy and transferability over state-of-the-art models based on deep learning and LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16553v1-abstract-full').style.display = 'none'; document.getElementById('2503.16553v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16376">arXiv:2503.16376</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16376">pdf</a>, <a href="https://arxiv.org/format/2503.16376">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LaPIG: Cross-Modal Generation of Paired Thermal and Visible Facial Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Leyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Joice Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16376v1-abstract-short" style="display: inline;"> The success of modern machine learning, particularly in facial translation networks, is highly dependent on the availability of high-quality, paired, large-scale datasets. However, acquiring sufficient data is often challenging and costly. 
Inspired by the recent success of diffusion models in high-quality image synthesis and advancements in Large Language Models (LLMs), we propose a novel framewor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16376v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16376v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16376v1-abstract-full" style="display: none;"> The success of modern machine learning, particularly in facial translation networks, is highly dependent on the availability of high-quality, paired, large-scale datasets. However, acquiring sufficient data is often challenging and costly. Inspired by the recent success of diffusion models in high-quality image synthesis and advancements in Large Language Models (LLMs), we propose a novel framework called LLM-assisted Paired Image Generation (LaPIG). This framework enables the construction of comprehensive, high-quality paired visible and thermal images using captions generated by LLMs. Our method encompasses three parts: visible image synthesis with ArcFace embedding, thermal image translation using Latent Diffusion Models (LDMs), and caption generation with LLMs. Our approach not only generates multi-view paired visible and thermal images to increase data diversity but also produces high-quality paired data while maintaining their identity information. We evaluate our method on public datasets by comparing it with existing methods, demonstrating the superiority of LaPIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16376v1-abstract-full').style.display = 'none'; document.getElementById('2503.16376v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
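<p class="is-size-7">LaPIG's three parts can be read as a thin orchestration layer. The sketch below mirrors only the stated structure (LLM caption, visible synthesis, thermal translation); every function is a placeholder stub of our own, since the abstract names the components (LLMs, ArcFace embeddings, LDMs) but not their interfaces.</p>
<pre><code>import numpy as np

# Placeholder stubs: a real pipeline would call an LLM, an ArcFace-conditioned
# generator, and a latent diffusion translator in place of these.
def llm_caption(identity_id):
    return f"a frontal face photo of subject {identity_id}"

def synthesize_visible(caption, arcface_embedding):
    return np.zeros((256, 256, 3), dtype=np.uint8)   # stand-in RGB image

def translate_to_thermal(visible_image):
    return visible_image.mean(axis=-1)               # stand-in thermal map

def paired_sample(identity_id, arcface_embedding):
    """Sketch of the flow described in the abstract:
    caption -> visible synthesis -> thermal translation."""
    caption = llm_caption(identity_id)
    visible = synthesize_visible(caption, arcface_embedding)
    thermal = translate_to_thermal(visible)
    return caption, visible, thermal

cap, vis, thr = paired_sample(7, np.random.default_rng(0).normal(size=512))
print(cap, vis.shape, thr.shape)
</code></pre>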
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16109">arXiv:2503.16109</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16109">pdf</a>, <a href="https://arxiv.org/format/2503.16109">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICFPT59805.2023.00044">10.1109/ICFPT59805.2023.00044 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DSLUT: An Asymmetric LUT and its Automatic Design Flow Based on Practical Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Moucheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Kaixiang Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lingli Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xuegong Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16109v1-abstract-short" style="display: inline;"> The conventional LUT is redundant since practical functions in real-world benchmarks only occupy a small proportion of all the functions. For example, there are only 3881 out of more than $10^{14}$ NPN classes of 6-input functions occurring in the mapped netlists of the VTR8 and Koios benchmarks. Therefore, we propose a novel LUT-like architecture, named DSLUT, with asymmetric inputs and programma&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16109v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16109v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16109v1-abstract-full" style="display: none;"> The conventional LUT is redundant since practical functions in real-world benchmarks only occupy a small proportion of all the functions. For example, there are only 3881 out of more than $10^{14}$ NPN classes of 6-input functions occurring in the mapped netlists of the VTR8 and Koios benchmarks. Therefore, we propose a novel LUT-like architecture, named DSLUT, with asymmetric inputs and programmable bits to efficiently implement the practical functions in domain-specific benchmarks instead of all the functions. The compact structure of the MUX Tree in the conventional LUT is preserved, while fewer programmable bits are connected to the MUX Tree according to the bit assignment generated by the proposed algorithm. A 6-input DSLUT with 26 SRAM bits is generated for evaluation, which is based on the practical functions of 39 circuits from the VTR8 and Koios benchmarks. After the synthesis flow of ABC, the post-synthesis results show that the proposed DSLUT6 architecture reduces the number of levels by 10.98% at a cost of 7.25% area overhead compared to the LUT5 architecture, while LUT6 reduces levels by 15.16% at a cost of 51.73% more PLB area.
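<p class="is-size-7">As an aside on the NPN-class statistic quoted above: counting NPN classes by brute force is straightforward at small scale. The sketch below (ours, not part of the paper's design flow) canonicalizes every 3-input truth table under input permutation, input negation, and output negation; it prints 14, the known number of 3-input NPN classes. The same idea, scaled up with smarter canonicalization, underlies counts like the 3881 practical 6-input classes.</p>
<pre><code>from itertools import permutations

def transform(tt, perm, neg_in, neg_out):
    """Apply an input permutation, input negations and an optional output
    negation to an 8-bit truth table of a 3-input Boolean function."""
    out = 0
    for idx in range(8):
        x = [(idx >> k) & 1 for k in range(3)]
        y = [x[perm[k]] ^ ((neg_in >> k) & 1) for k in range(3)]
        j = y[0] | (y[1] << 1) | (y[2] << 2)
        out |= (((tt >> j) & 1) ^ neg_out) << idx
    return out

classes = set()
for tt in range(256):
    # Canonical representative: minimum table over all NPN transforms.
    classes.add(min(transform(tt, p, n, o)
                    for p in permutations(range(3))
                    for n in range(8)
                    for o in (0, 1)))
print(len(classes))  # 14 NPN classes of 3-input functions
</code></pre>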
After the full VTR flow, the post-implementation results show that the proposed DSLUT6 improves performance by 4.59% over LUT5, close to the 5.42% improvement of LUT6 over LUT5, while incurring less area overhead (6.81% for DSLUT6 versus 10.93% for LUT6). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16109v1-abstract-full').style.display = 'none'; document.getElementById('2503.16109v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in the Proceedings of the 2023 International Conference on Field Programmable Technology (ICFPT)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2023 International Conference on Field Programmable Technology (ICFPT), Yokohama, Japan, 2023, pp. 278-279 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15934">arXiv:2503.15934</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15934">pdf</a>, <a href="https://arxiv.org/format/2503.15934">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SaMam: Style-aware State Space Model for Arbitrary Image Style Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongda Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Longguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Ye Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Ziru Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yulan Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15934v1-abstract-short" style="display: inline;"> A global effective receptive field plays a crucial role in image style transfer (ST) for obtaining high-quality stylized results. However, existing ST backbones (e.g., CNNs and Transformers) suffer from huge computational complexity to achieve global receptive fields. Recently, the State Space Model (SSM), especially the improved variant Mamba, has shown great potential for long-range dependency modeling wi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15934v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15934v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15934v1-abstract-full" style="display: none;"> A global effective receptive field plays a crucial role in image style transfer (ST) for obtaining high-quality stylized results. However, existing ST backbones (e.g., CNNs and Transformers) suffer from huge computational complexity to achieve global receptive fields.
Recently, the State Space Model (SSM), especially the improved variant Mamba, has shown great potential for long-range dependency modeling with linear complexity, which offers an approach to resolving the above dilemma. In this paper, we develop a Mamba-based style transfer framework, termed SaMam. Specifically, a Mamba encoder is designed to efficiently extract content and style information. In addition, a style-aware Mamba decoder is developed to flexibly adapt to various styles. Moreover, to address the problems of local pixel forgetting, channel redundancy and spatial discontinuity of existing SSMs, we introduce both local enhancement and zigzag scan. Qualitative and quantitative results demonstrate that our SaMam outperforms state-of-the-art methods in terms of both accuracy and efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15934v1-abstract-full').style.display = 'none'; document.getElementById('2503.15934v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 10 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15880">arXiv:2503.15880</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15880">pdf</a>, <a href="https://arxiv.org/format/2503.15880">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> InCo-DPO: Balancing Distribution Shift and Data Quality for Enhanced Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yunan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jijie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo-Wen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liangdong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Guang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15880v1-abstract-short" style="display: inline;"> Direct Preference Optimization (DPO) optimizes language models to align with human preferences. Utilizing on-policy samples, generated directly by the policy model, typically results in better performance due to their distribution consistency with the model compared to off-policy samples. This paper identifies the quality of candidate preference samples as another critical factor.
While the quality&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15880v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15880v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15880v1-abstract-full" style="display: none;"> Direct Preference Optimization (DPO) optimizes language models to align with human preferences. Utilizing on-policy samples, generated directly by the policy model, typically results in better performance due to their distribution consistency with the model compared to off-policy samples. This paper identifies the quality of candidate preference samples as another critical factor. While the quality of on-policy data is inherently constrained by the capabilities of the policy model, off-policy data, which can be derived from diverse sources, offers greater potential for quality despite experiencing distribution shifts. However, current research mostly relies on on-policy data and neglects the value of off-policy data in terms of data quality, due to the challenge posed by distribution shift. In this paper, we propose InCo-DPO, an efficient method for synthesizing preference data by integrating on-policy and off-policy data, allowing dynamic adjustments to balance distribution shifts and data quality, thus finding an optimal trade-off. Consequently, InCo-DPO overcomes the limitations of distribution shifts in off-policy data and the quality constraints of on-policy data. We evaluated InCo-DPO with the Alpaca-Eval 2.0 and Arena-Hard benchmarks. Experimental results demonstrate that our approach not only outperforms both on-policy and off-policy data but also achieves a state-of-the-art win rate of 60.8 on Arena-Hard with vanilla DPO using the Gemma-2 model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15880v1-abstract-full').style.display = 'none'; document.getElementById('2503.15880v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
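<p class="is-size-7">For context, the vanilla DPO objective that InCo-DPO builds on fits in a few lines, and the on-policy/off-policy integration can be caricatured as a quality-versus-shift score. The sketch below is our assumed reading for illustration only; the abstract gives neither a weighting formula nor an interface, so every name and constant is hypothetical.</p>
<pre><code>import numpy as np

def dpo_loss(lp_w_pol, lp_l_pol, lp_w_ref, lp_l_ref, beta=0.1):
    """Vanilla DPO loss for one (chosen, rejected) pair, given policy and
    reference log-probs of the chosen (w) and rejected (l) responses."""
    margin = beta * ((lp_w_pol - lp_w_ref) - (lp_l_pol - lp_l_ref))
    return -np.log(1.0 / (1.0 + np.exp(-margin)))   # -log sigmoid(margin)

def pick_candidate(quality, shift_penalty, lam=0.5):
    """Assumed trade-off in the InCo spirit: prefer high-quality candidates,
    discounted by how far they sit from the policy's own distribution."""
    scores = np.asarray(quality) - lam * np.asarray(shift_penalty)
    return int(np.argmax(scores))

# Toy pool: index 0 is on-policy (no shift), indices 1-2 are off-policy.
print(pick_candidate(quality=[0.6, 0.9, 0.8], shift_penalty=[0.0, 0.5, 0.9]))
print(dpo_loss(-12.0, -15.0, -12.5, -14.0))
</code></pre>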
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15875">arXiv:2503.15875</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15875">pdf</a>, <a href="https://arxiv.org/format/2503.15875">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MiLA: Multi-view Intensive-fidelity Long-term Video Generation World Model for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haiguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Daqi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+H">Hongwei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haisong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+E">Enhui Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kaicheng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Limin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bing Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15875v1-abstract-short" style="display: inline;"> In recent years, data-driven techniques have greatly advanced autonomous driving systems, but the need for rare and diverse training data remains a challenge, requiring significant investment in equipment and labor. World models, which predict and generate future environmental states, offer a promising solution by synthesizing annotated video data for training. However, existing methods struggle t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15875v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15875v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15875v1-abstract-full" style="display: none;"> In recent years, data-driven techniques have greatly advanced autonomous driving systems, but the need for rare and diverse training data remains a challenge, requiring significant investment in equipment and labor. World models, which predict and generate future environmental states, offer a promising solution by synthesizing annotated video data for training. However, existing methods struggle to generate long, consistent videos without accumulating errors, especially in dynamic scenes. To address this, we propose MiLA, a novel framework for generating high-fidelity, long-duration videos up to one minute. MiLA utilizes a Coarse-to-Re(fine) approach to both stabilize video generation and correct distortion of dynamic objects. Additionally, we introduce a Temporal Progressive Denoising Scheduler and Joint Denoising and Correcting Flow modules to improve the quality of generated videos. Extensive experiments on the nuScenes dataset show that MiLA achieves state-of-the-art performance in video generation quality. For more information, visit the project website: https://github.com/xiaomi-mlab/mila.github.io. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15875v1-abstract-full').style.display = 'none'; document.getElementById('2503.15875v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project website: https://github.com/xiaomi-mlab/mila.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15731">arXiv:2503.15731</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15731">pdf</a>, <a href="https://arxiv.org/format/2503.15731">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Graph-Weighted Contrastive Learning for Semi-Supervised Hyperspectral Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuqing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Q">Qi Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Ligeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+K">Kai Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+K">Kun Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15731v1-abstract-short" style="display: inline;"> Most existing graph-based semi-supervised hyperspectral image classification methods rely on superpixel partitioning techniques. However, they suffer from misclassification of certain pixels due to inaccuracies in superpixel boundaries, \ie, the initial inaccuracies in superpixel partitioning limit overall classification performance. In this paper, we propose a novel graph-weighted contrastive lea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15731v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15731v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15731v1-abstract-full" style="display: none;"> Most existing graph-based semi-supervised hyperspectral image classification methods rely on superpixel partitioning techniques. However, they suffer from misclassification of certain pixels due to inaccuracies in superpixel boundaries, \ie, the initial inaccuracies in superpixel partitioning limit overall classification performance. In this paper, we propose a novel graph-weighted contrastive learning approach that avoids the use of superpixel partitioning and directly employs neural networks to learn hyperspectral image representation. 
Furthermore, while many approaches require all graph nodes to be available during training, our approach supports mini-batch training by processing only a subset of nodes at a time, reducing computational complexity and improving generalization to unseen nodes. Experimental results on three widely-used datasets demonstrate the effectiveness of the proposed approach compared to baselines relying on superpixel partitioning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15731v1-abstract-full').style.display = 'none'; document.getElementById('2503.15731v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Journal of Electronic Imaging, 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15724">arXiv:2503.15724</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15724">pdf</a>, <a href="https://arxiv.org/format/2503.15724">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reward Training Wheels: Adaptive Auxiliary Rewards for Robotics Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linji Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yuanjie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xuesu Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15724v1-abstract-short" style="display: inline;"> Robotics Reinforcement Learning (RL) often relies on carefully engineered auxiliary rewards to supplement sparse primary learning objectives to compensate for the lack of large-scale, real-world, trial-and-error data. While these auxiliary rewards accelerate learning, they require significant engineering effort, may introduce human biases, and cannot adapt to the robot&#39;s evolving capabilities duri&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15724v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15724v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15724v1-abstract-full" style="display: none;"> Robotics Reinforcement Learning (RL) often relies on carefully engineered auxiliary rewards to supplement sparse primary learning objectives to compensate for the lack of large-scale, real-world, trial-and-error data. While these auxiliary rewards accelerate learning, they require significant engineering effort, may introduce human biases, and cannot adapt to the robot&#39;s evolving capabilities during training. 
In this paper, we introduce Reward Training Wheels (RTW), a teacher-student framework that automates auxiliary reward adaptation for robotics RL. To be specific, the RTW teacher dynamically adjusts auxiliary reward weights based on the student&#39;s evolving capabilities to determine which auxiliary reward aspects require more or less emphasis to improve the primary objective. We demonstrate RTW on two challenging robot tasks: navigation in highly constrained spaces and off-road vehicle mobility on vertically challenging terrain. In simulation, RTW outperforms expert-designed rewards by 2.35% in navigation success rate and improves off-road mobility performance by 122.62%, while achieving 35% and 3X faster training efficiency, respectively. Physical robot experiments further validate RTW&#39;s effectiveness, achieving a perfect success rate (5/5 trials vs. 2/5 for expert-designed rewards) and improving vehicle stability with up to 47.4% reduction in orientation angles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15724v1-abstract-full').style.display = 'none'; document.getElementById('2503.15724v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15524">arXiv:2503.15524</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15524">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3708359.3712107">10.1145/3708359.3712107 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> KHAIT: K-9 Handler Artificial Intelligence Teaming for Collaborative Sensemaking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilchek%2C+M">Matthew Wilchek</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linhan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dickinson%2C+S">Sally Dickinson</a>, <a href="/search/cs?searchtype=author&amp;query=Feuerbacher%2C+E">Erica Feuerbacher</a>, <a href="/search/cs?searchtype=author&amp;query=Luther%2C+K">Kurt Luther</a>, <a href="/search/cs?searchtype=author&amp;query=Batarseh%2C+F+A">Feras A. 
Batarseh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15524v1-abstract-short" style="display: inline;"> In urban search and rescue (USAR) operations, communication between handlers and specially trained canines is crucial but often complicated by challenging environments and the specific behaviors canines are trained to exhibit when detecting a person. Since a USAR canine often works out of sight of the handler, the handler lacks awareness of the canine&#39;s location and situation, known as the &#39;sensem&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15524v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15524v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15524v1-abstract-full" style="display: none;"> In urban search and rescue (USAR) operations, communication between handlers and specially trained canines is crucial but often complicated by challenging environments and the specific behaviors canines are trained to exhibit when detecting a person. Since a USAR canine often works out of sight of the handler, the handler lacks awareness of the canine&#39;s location and situation, known as the &#39;sensemaking gap.&#39; In this paper, we propose KHAIT, a novel approach to close the sensemaking gap and enhance USAR effectiveness by integrating object detection-based Artificial Intelligence (AI) and Augmented Reality (AR). Equipped with AI-powered cameras, edge computing, and AR headsets, KHAIT enables precise and rapid object detection from a canine&#39;s perspective, improving survivor localization. We evaluate this approach in a real-world USAR environment, demonstrating an average survival allocation time decrease of 22%, enhancing the speed and accuracy of operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15524v1-abstract-full').style.display = 'none'; document.getElementById('2503.15524v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 7 figures, ACM 30th International Conference on Intelligent User Interfaces (IUI 25)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11; I.4.8; H.5.2; H.5.3; J.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15275">arXiv:2503.15275</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15275">pdf</a>, <a href="https://arxiv.org/format/2503.15275">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Challenges and Trends in Egocentric Vision: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+H">Heqian Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lanxiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanwen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+C">Chenghao Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+L">Linfeng Han</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Huiyu Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongliang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15275v1-abstract-short" style="display: inline;"> With the rapid development of artificial intelligence technologies and wearable devices, egocentric vision understanding has emerged as a new and challenging research direction, gradually attracting widespread attention from both academia and industry. Egocentric vision captures visual and multimodal data through cameras or sensors worn on the human body, offering a unique perspective that simulat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15275v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15275v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15275v1-abstract-full" style="display: none;"> With the rapid development of artificial intelligence technologies and wearable devices, egocentric vision understanding has emerged as a new and challenging research direction, gradually attracting widespread attention from both academia and industry. Egocentric vision captures visual and multimodal data through cameras or sensors worn on the human body, offering a unique perspective that simulates human visual experiences. This paper provides a comprehensive survey of the research on egocentric vision understanding, systematically analyzing the components of egocentric scenes and categorizing the tasks into four main areas: subject understanding, object understanding, environment understanding, and hybrid understanding. We explore in detail the sub-tasks within each category. 
We also summarize the main challenges and trends currently existing in the field. Furthermore, this paper presents an overview of high-quality egocentric vision datasets, offering valuable resources for future research. By summarizing the latest advancements, we anticipate the broad applications of egocentric vision technologies in fields such as augmented reality, virtual reality, and embodied intelligence, and propose future research directions based on the latest developments in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15275v1-abstract-full').style.display = 'none'; document.getElementById('2503.15275v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14797">arXiv:2503.14797</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.14797">pdf</a>, <a href="https://arxiv.org/format/2503.14797">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FACTS&amp;EVIDENCE: An Interactive Tool for Transparent Fine-Grained Factual Verification of Machine-Generated Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Boonsanong%2C+V">Varich Boonsanong</a>, <a href="/search/cs?searchtype=author&amp;query=Balachandran%2C+V">Vidhisha Balachandran</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xiaochuang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+S">Shangbin Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L+L">Lucy Lu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14797v1-abstract-short" style="display: inline;"> With the widespread consumption of AI-generated content, there has been an increased focus on developing automated tools to verify the factual accuracy of such content. However, prior research and tools developed for fact verification treat it as a binary classification or a linear regression problem. Although this is a useful mechanism as part of automatic guardrails in systems, we argue that suc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.14797v1-abstract-full').style.display = 'inline'; document.getElementById('2503.14797v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.14797v1-abstract-full" style="display: none;"> With the widespread consumption of AI-generated content, there has been an increased focus on developing automated tools to verify the factual accuracy of such content. However, prior research and tools developed for fact verification treat it as a binary classification or a linear regression problem. 
arXiv:2503.14504 (https://arxiv.org/abs/2503.14504) [cs.CV]
Aligning Multimodal LLM with Human Preference: A Survey
Authors: Tao Yu, Yi-Fan Zhang, Chaoyou Fu, Junkang Wu, Jinda Lu, Kun Wang, Xingyu Lu, Yunhang Shen, Guibin Zhang, Dingjie Song, Yibo Yan, Tianlong Xu, Qingsong Wen, Zhang Zhang, Yan Huang, Liang Wang, Tieniu Tan
Abstract: Large language models (LLMs) can handle a wide variety of general tasks with simple prompts, without the need for task-specific training. Multimodal Large Language Models (MLLMs), built upon LLMs, have demonstrated impressive potential in tackling complex tasks involving visual, auditory, and textual data. However, critical issues related to truthfulness, safety, o1-like reasoning, and alignment with human preference remain insufficiently addressed. This gap has spurred the emergence of various alignment algorithms, each targeting different application scenarios and optimization goals. Recent studies have shown that alignment algorithms are a powerful approach to resolving the aforementioned challenges. In this paper, we aim to provide a comprehensive and systematic review of alignment algorithms for MLLMs. Specifically, we explore four key aspects: (1) the application scenarios covered by alignment algorithms, including general image understanding, multi-image, video, and audio, and extended multimodal applications; (2) the core factors in constructing alignment datasets, including data sources, model responses, and preference annotations; (3) the benchmarks used to evaluate alignment algorithms; and (4) a discussion of potential future directions for the development of alignment algorithms. This work seeks to help researchers organize current advancements in the field and inspire better alignment methods. The project page of this paper is available at https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Alignment.
Submitted 23 March, 2025; v1 submitted 18 March, 2025; originally announced March 2025.
Comments: Project page: https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Alignment

arXiv:2503.14237 (https://arxiv.org/abs/2503.14237) [cs.CV]
Make Your Training Flexible: Towards Deployment-Efficient Video Models
Authors: Chenting Wang, Kunchang Li, Tianxiang Jiang, Xiangyu Zeng, Yi Wang, Limin Wang
Abstract: Popular video training methods mainly operate on a fixed number of tokens sampled from a predetermined spatiotemporal grid, resulting in sub-optimal accuracy-computation trade-offs due to inherent video redundancy. They also lack adaptability to varying computational budgets for downstream tasks, hindering the application of the most competitive models in real-world scenes. We thus propose a new test setting, Token Optimization, for maximized input information across budgets, which optimizes the size-limited set of input tokens through token selection from more suitably sampled videos. To this end, we propose a novel augmentation tool termed Flux. By making the sampling grid flexible and leveraging token selection, it is easily adopted in most popular video training frameworks, boosting model robustness with nearly no additional cost. We integrate Flux into large-scale video pre-training, and the resulting FluxViT establishes new state-of-the-art results across extensive tasks at standard costs. Notably, with only 1/4 of the tokens, it can still match the performance of previous state-of-the-art models with Token Optimization, yielding nearly 90% savings. All models and data are available at https://github.com/OpenGVLab/FluxViT.
Submitted 18 March, 2025; originally announced March 2025.
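As a rough illustration of Token Optimization, the sketch below oversamples tokens from a denser spatiotemporal grid and keeps only the top-k under a fixed budget; the feature-norm scoring rule is an assumed placeholder, not the paper's selection criterion.

    import torch

    def select_tokens(tokens, budget):
        # tokens: (num_tokens, dim), sampled from a finer grid than the budget
        # allows; keep the top-`budget` tokens by an (assumed) importance proxy,
        # preserving the original token order.
        scores = tokens.norm(dim=-1)
        keep = scores.topk(budget).indices.sort().values
        return tokens[keep]

    dense = torch.randn(1024, 768)       # e.g. 4x-oversampled video tokens
    inputs = select_tokens(dense, 256)   # fits a 1/4-token budget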
arXiv:2503.14037 (https://arxiv.org/abs/2503.14037) [cs.CV]
Intra and Inter Parser-Prompted Transformers for Effective Image Restoration
Authors: Cong Wang, Jinshan Pan, Liyan Wang, Wei Wang
Abstract: We propose Intra and Inter Parser-Prompted Transformers (PPTformer), which explore useful features from visual foundation models for image restoration. Specifically, PPTformer contains two parts: an Image Restoration Network (IRNet) for restoring images from degraded observations, and a Parser-Prompted Feature Generation Network (PPFGNet) for providing IRNet with reliable parser information to boost restoration. To enhance the integration of the parser within IRNet, we propose Intra Parser-Prompted Attention (IntraPPA) and Inter Parser-Prompted Attention (InterPPA) to implicitly and explicitly learn useful parser features that facilitate restoration. IntraPPA re-considers cross attention between parser and restoration features, enabling implicit perception of the parser from a long-range, intra-layer perspective. Conversely, InterPPA first fuses restoration features with those of the parser, and then formulates these fused features within an attention mechanism to explicitly perceive parser information. Further, we propose a parser-prompted feed-forward network to guide restoration via pixel-wise gating modulation. Experimental results show that PPTformer achieves state-of-the-art performance on image deraining, defocus deblurring, desnowing, and low-light enhancement.
Submitted 18 March, 2025; originally announced March 2025.
Comments: This version is accepted by the Association for the Advancement of Artificial Intelligence (AAAI-25).
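The intra-layer branch is, at its core, cross-attention from restoration features to parser features. A single-head sketch, with the shapes and the residual fusion assumed for illustration:

    import torch

    def parser_prompted_attention(restore_feat, parser_feat):
        # restore_feat: (n, d) queries; parser_feat: (m, d) keys/values.
        d = restore_feat.shape[-1]
        attn = torch.softmax(restore_feat @ parser_feat.T / d ** 0.5, dim=-1)
        return restore_feat + attn @ parser_feat   # residual fusion (assumed)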
arXiv:2503.13443 (https://arxiv.org/abs/2503.13443) [cs.CV, cs.MM]
DPC: Dual-Prompt Collaboration for Tuning Vision-Language Models
Authors: Haoyang Li, Liang Wang, Chao Wang, Jing Jiang, Yan Peng, Guodong Long
Abstract: The Base-New Trade-off (BNT) problem universally exists during the optimization of CLIP-based prompt tuning, where continuous fine-tuning on base (target) classes leads to a simultaneous decrease in generalization ability on new (unseen) classes. Existing approaches attempt to regulate the prompt tuning process to balance the BNT by appending constraints. However, imposed on the same target prompt, these constraints fail to fully avert the mutual exclusivity between the optimization directions for base and new classes. As a novel solution to this challenge, we propose the plug-and-play Dual-Prompt Collaboration (DPC) framework, the first to decouple the optimization processes of the base and new tasks at the prompt level. Specifically, we clone a learnable parallel prompt from the backbone prompt, and introduce a variable Weighting-Decoupling framework to independently control the optimization directions of the dual prompts for the base and new tasks, thus avoiding the conflict in generalization. Meanwhile, we propose a Dynamic Hard Negative Optimizer that uses the dual prompts to construct a more challenging optimization task on base classes for enhancement. For interpretability, we prove the feature channel invariance of the prompt vector during the optimization process, providing theoretical support for the Weighting-Decoupling of DPC. Extensive experiments on multiple backbones demonstrate that DPC can significantly improve base performance without introducing any external knowledge beyond the base classes, while maintaining generalization to new classes. Code is available at: https://github.com/JREion/DPC.
Submitted 17 March, 2025; originally announced March 2025.
Comments: Accepted by the IEEE/CVF Conference on Computer Vision and Pattern Recognition 2025 (CVPR 2025).
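A minimal sketch of the decoupling idea, with the mixing rule assumed purely for illustration: the backbone prompt stays frozen for new classes, while a cloned prompt is tuned on base classes and blended back in with an independent weight.

    import torch

    backbone_prompt = torch.randn(16, 512)                       # kept for new classes
    parallel_prompt = backbone_prompt.clone().requires_grad_()   # tuned on base classes

    def prompt_for(task, w_base=0.8):
        if task == "base":   # independent weighting for the base direction
            return w_base * parallel_prompt + (1 - w_base) * backbone_prompt
        return backbone_prompt   # new classes see the unmodified prompt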
arXiv:2503.13356 (https://arxiv.org/abs/2503.13356) [cs.LG]
Agents Play Thousands of 3D Video Games
Authors: Zhongwen Xu, Xianliang Wang, Siyi Li, Tao Yu, Liang Wang, Qiang Fu, Wei Yang
Abstract: We present PORTAL, a novel framework for developing artificial intelligence agents capable of playing thousands of 3D video games through language-guided policy generation. By transforming decision-making problems into language modeling tasks, our approach leverages large language models (LLMs) to generate behavior trees represented in a domain-specific language (DSL). This method eliminates the computational burden associated with traditional reinforcement learning approaches while preserving strategic depth and rapid adaptability. Our framework introduces a hybrid policy structure that combines rule-based nodes with neural network components, enabling both high-level strategic reasoning and precise low-level control. A dual-feedback mechanism incorporating quantitative game metrics and vision-language model analysis facilitates iterative policy improvement at both tactical and strategic levels. The resulting policies are instantaneously deployable, human-interpretable, and capable of generalizing across diverse gaming environments. Experimental results demonstrate PORTAL's effectiveness across thousands of first-person shooter (FPS) games, showcasing significant improvements in development efficiency, policy generalization, and behavior diversity compared to traditional approaches. PORTAL represents a significant advancement in game AI development, offering a practical solution for creating sophisticated agents that can operate across thousands of commercial video games with minimal development overhead. Experiment results on the 3D video games are best viewed at https://zhongwen.one/projects/portal.
Submitted 17 March, 2025; originally announced March 2025.
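The "behavior trees in a DSL" can be pictured as rule-based composites over leaf conditions and actions, of the kind an LLM could emit as text; the node names and game-state interface below are hypothetical.

    class Sequence:              # succeeds only if every child succeeds
        def __init__(self, *children): self.children = children
        def tick(self, state): return all(c.tick(state) for c in self.children)

    class Selector:              # first child that succeeds wins
        def __init__(self, *children): self.children = children
        def tick(self, state): return any(c.tick(state) for c in self.children)

    class EnemyVisible:
        def tick(self, state): return state.get("enemy_visible", False)

    class Attack:
        def tick(self, state): state["action"] = "attack"; return True

    class Patrol:
        def tick(self, state): state["action"] = "patrol"; return True

    # Attack when an enemy is visible, otherwise patrol.
    policy = Selector(Sequence(EnemyVisible(), Attack()), Patrol())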
arXiv:2503.13026 (https://arxiv.org/abs/2503.13026) [cs.CV]
HiMTok: Learning Hierarchical Mask Tokens for Image Segmentation with Large Multimodal Model
Authors: Tao Wang, Changxu Cheng, Lingfeng Wang, Senda Chen, Wuyue Zhao
Abstract: The remarkable performance of large multimodal models (LMMs) has attracted significant interest from the image segmentation community. To align with the next-token-prediction paradigm, current LMM-driven segmentation methods either use object boundary points to represent masks or introduce special segmentation tokens, whose hidden states are decoded by a segmentation model requiring the original image as input. However, these approaches often suffer from inadequate mask representation and complex architectures, limiting the potential of LMMs. In this work, we propose the Hierarchical Mask Tokenizer (HiMTok), which represents segmentation masks with up to 32 tokens and eliminates the need for the original image during mask de-tokenization. HiMTok allows for compact, coarse-to-fine mask representations, aligning well with the LLM next-token-prediction paradigm and facilitating the direct acquisition of segmentation capabilities. We develop a three-stage training recipe for progressive learning of segmentation and visual capabilities, featuring a hierarchical mask loss for effective coarse-to-fine learning. Additionally, we enable bidirectional information flow, allowing conversion between bounding boxes and mask tokens to fully leverage the multi-task training potential. Extensive experiments demonstrate that our method achieves state-of-the-art performance across various segmentation tasks, while also enhancing visual grounding and maintaining overall visual understanding.
Submitted 17 March, 2025; originally announced March 2025.
Comments: technical report
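The coarse-to-fine property can be read as: each longer prefix of the (at most 32) mask tokens decodes to a higher-resolution mask, so early tokens carry coarse shape and later ones refine it. A schematic sketch, where the prefix lengths, resolutions, and `decoder` signature are all assumptions rather than HiMTok's actual design:

    import torch

    def decode_coarse_to_fine(mask_tokens, decoder,
                              prefixes=(4, 8, 16, 32),
                              resolutions=(16, 32, 64, 128)):
        # mask_tokens: (32, d); decoder(prefix, res) -> (res, res) mask (hypothetical).
        return [decoder(mask_tokens[:k], res)
                for k, res in zip(prefixes, resolutions)]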
arXiv:2503.13012 (https://arxiv.org/abs/2503.13012) [cs.CV, cs.AI]
Test-Time Domain Generalization via Universe Learning: A Multi-Graph Matching Approach for Medical Image Segmentation
Authors: Xingguo Lv, Xingbo Dong, Liwen Wang, Jiewen Yang, Lei Zhao, Bin Pu, Zhe Jin, Xuejun Li
Abstract: Although domain generalization (DG) has significantly addressed the performance degradation of pre-trained models caused by domain shifts, it often falls short in real-world deployment. Test-time adaptation (TTA), which adjusts a learned model using unlabeled test data, presents a promising solution. However, most existing TTA methods struggle to deliver strong performance in medical image segmentation, primarily because they overlook the crucial prior knowledge inherent to medical images. To address this challenge, we incorporate morphological information and propose a framework based on multi-graph matching. Specifically, we introduce learnable universe embeddings that integrate morphological priors during multi-source training, along with novel unsupervised test-time paradigms for domain adaptation. This approach guarantees cycle-consistency in multi-matching while enabling the model to more effectively capture the invariant priors of unseen data, significantly mitigating the effects of domain shifts. Extensive experiments demonstrate that our method outperforms other state-of-the-art approaches on two medical image segmentation benchmarks for both multi-source and single-source domain generalization tasks. The source code is available at https://github.com/Yore0/TTDG-MGM.
Submitted 17 March, 2025; originally announced March 2025.
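The cycle-consistency guarantee is a standard property of universe-based matching: when every graph's nodes are assigned to slots of one shared universe, any pairwise matching composed through the universe is consistent by construction. A toy sketch (the slot assignments are made up):

    import numpy as np

    def assignment(universe_ids, universe_size):
        # One-hot matrix mapping each graph node to a universe slot.
        P = np.zeros((len(universe_ids), universe_size), dtype=int)
        P[np.arange(len(universe_ids)), universe_ids] = 1
        return P

    U = 4                            # size of the shared universe
    P1 = assignment([0, 2, 3], U)    # nodes of graph 1 -> universe slots
    P2 = assignment([2, 0, 3], U)
    P3 = assignment([3, 2, 0], U)
    X12, X23, X13 = P1 @ P2.T, P2 @ P3.T, P1 @ P3.T
    assert (X12 @ X23 == X13).all()  # cycle-consistent by construction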
arXiv:2503.12964 (https://arxiv.org/abs/2503.12964) [cs.CV, cs.AI, cs.LG]
Training Video Foundation Models with NVIDIA NeMo
Authors: Zeeshan Patel, Ethan He, Parth Mannan, Xiaowei Ren, Ryan Wolf, Niket Agarwal, Jacob Huffman, Zhuoyao Wang, Carl Wang, Jack Chang, Yan Bai, Tommy Huang, Linnan Wang, Sahil Jain, Shanmugam Ramasamy, Joseph Jennings, Ekaterina Sirazitdinova, Oleg Sudakov, Mingyuan Ma, Bobby Chen, Forrest Lin, Hao Wang, Vasanth Rao Naik Sabavat, Sriharsha Niverty, Rong Ou, et al. (4 additional authors not shown)
Abstract: Video Foundation Models (VFMs) have recently been used to simulate the real world to train physical AI systems and develop creative visual experiences. However, there are significant challenges in training large-scale VFMs that can generate high-quality videos. We present a scalable, open-source VFM training pipeline with NVIDIA NeMo, providing accelerated video dataset curation, multimodal data loading, and parallelized video diffusion model training and inference. We also provide a comprehensive performance analysis highlighting best practices for efficient VFM training and inference.
Submitted 17 March, 2025; originally announced March 2025.
arXiv:2503.12818 (https://arxiv.org/abs/2503.12818) [cs.IT]
Cross-Layer Security for Semantic Communications: Metrics and Optimization
Authors: Lingyi Wang, Wei Wu, Fuhui Zhou, Zhijin Qin, Qihui Wu
Abstract: Unlike traditional secure communication, which focuses on symbolic protection at the physical layer, semantic secure communication requires further attention to semantic-level task performance at the application layer. There is a research gap in how to comprehensively evaluate and optimize the security performance of semantic communication. To fill this gap, a unified semantic security metric, the cross-layer semantic secure rate (CLSSR), is defined to estimate cross-layer security requirements at both the physical layer and the application layer. We then formulate the maximization of the CLSSR as a mixed-integer nonlinear program (MINLP). We propose a hierarchical AI-native semantic secure communication network with a reinforcement learning (RL)-based semantic resource allocation scheme, aiming to ensure cross-layer semantic security (CL-SS). Finally, we prove the convergence of our proposed intelligent resource allocation, and the simulation results demonstrate that our proposed CL-SS method outperforms the traditional physical-layer semantic security (PL-SS) method in terms of both task reliability and CLSSR.
Submitted 17 March, 2025; originally announced March 2025.
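For orientation (this is textbook background, not the paper's definition): the classical physical-layer secrecy rate against an eavesdropper is $C_s = \left[\log_2(1+\gamma_B) - \log_2(1+\gamma_E)\right]^+$, the positive part of the legitimate channel's rate minus the eavesdropper's, with $\gamma_B$ and $\gamma_E$ the respective SNRs. The CLSSR proposed here augments a physical-layer quantity of this kind with application-layer semantic task performance; its exact form is given in the paper.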
arXiv:2503.12668 (https://arxiv.org/abs/2503.12668) [cs.LG, cs.PF]
ZO2: Scalable Zeroth-Order Fine-Tuning for Extremely Large Language Models with Limited GPU Memory
Authors: Liangyu Wang, Jie Ren, Hang Xu, Junxiao Wang, Huanyi Xie, David E. Keyes, Di Wang
Abstract: Fine-tuning large pre-trained LLMs generally demands extensive GPU memory. Traditional first-order optimizers like SGD encounter substantial difficulties due to the increased memory requirements of storing activations and gradients during both the forward and backward phases as the model size expands. Alternatively, zeroth-order (ZO) techniques can compute gradients using just forward operations, eliminating the need to store activations. Furthermore, by leveraging CPU capabilities, it is feasible to enhance both the memory and processing power available to a single GPU. We propose a novel framework, ZO2 (Zeroth-Order Offloading), for efficient zeroth-order fine-tuning of LLMs with only limited GPU memory. Our framework dynamically shifts model parameters between the CPU and GPU as required, optimizing computation flow and maximizing GPU usage by minimizing downtime. This integration of parameter adjustments with ZO's double forward operations reduces unnecessary data movement, enhancing fine-tuning efficacy. Additionally, our framework supports an innovative low-bit precision approach in AMP mode to streamline data exchanges between the CPU and GPU. Employing this approach allows us to fine-tune extraordinarily large models, such as OPT-175B with more than 175 billion parameters, on a mere 18GB GPU, which is beyond the reach of traditional methods. Moreover, our framework achieves these results with almost no additional time overhead and no accuracy loss compared to standard zeroth-order methods. ZO2's code has been open-sourced at https://github.com/liangyuwang/zo2.
Submitted 16 March, 2025; originally announced March 2025.
Comments: 14 pages, 7 figures
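The "double forward operations" refer to the standard two-point zeroth-order estimator: perturb the parameters along a random direction, evaluate the loss twice, and step along that direction scaled by the finite difference. A generic sketch of that estimator (not ZO2's offloading engine):

    import torch

    @torch.no_grad()
    def zo_step(params, loss_fn, lr=1e-6, eps=1e-3):
        # Two forward passes; no activations or gradients are stored.
        z = [torch.randn_like(p) for p in params]
        for p, zi in zip(params, z): p.add_(eps * zi)       # theta + eps*z
        loss_plus = loss_fn()
        for p, zi in zip(params, z): p.sub_(2 * eps * zi)   # theta - eps*z
        loss_minus = loss_fn()
        g = (loss_plus - loss_minus) / (2 * eps)            # projected gradient
        for p, zi in zip(params, z): p.add_(eps * zi - lr * g * zi)  # theta - lr*g*z
        return 0.5 * (loss_plus + loss_minus)

Because each step needs only forward passes and the scalar finite difference, parameter blocks can be streamed between CPU and GPU on demand, which is the property the abstract says ZO2 exploits.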
