Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,029 results for author: <span class="mathjax">Huang, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Huang%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huang, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huang%2C+W&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huang, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Huang%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24361">arXiv:2503.24361</a> <span> [<a href="https://arxiv.org/pdf/2503.24361">pdf</a>, <a href="https://arxiv.org/format/2503.24361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sim-and-Real Co-Training: A Simple Recipe for Vision-Based Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Maddukuri%2C+A">Abhiram Maddukuri</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhenyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L+Y">Lawrence Yunliang Chen</a>, <a href="/search/cs?searchtype=author&query=Nasiriany%2C+S">Soroush Nasiriany</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuqi Xie</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yu Fang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenqi Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zu Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhenjia Xu</a>, <a href="/search/cs?searchtype=author&query=Chernyadev%2C+N">Nikita Chernyadev</a>, <a href="/search/cs?searchtype=author&query=Reed%2C+S">Scott Reed</a>, <a href="/search/cs?searchtype=author&query=Goldberg%2C+K">Ken Goldberg</a>, <a href="/search/cs?searchtype=author&query=Mandlekar%2C+A">Ajay Mandlekar</a>, <a 
href="/search/cs?searchtype=author&query=Fan%2C+L">Linxi Fan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24361v1-abstract-short" style="display: inline;"> Large real-world robot datasets hold great potential to train generalist robot models, but scaling real-world human data collection is time-consuming and resource-intensive. Simulation has great potential in supplementing large-scale data, especially with recent advances in generative AI and automated data generation tools that enable scalable creation of robot behavior datasets. However, training… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24361v1-abstract-full').style.display = 'inline'; document.getElementById('2503.24361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.24361v1-abstract-full" style="display: none;"> Large real-world robot datasets hold great potential to train generalist robot models, but scaling real-world human data collection is time-consuming and resource-intensive. Simulation has great potential in supplementing large-scale data, especially with recent advances in generative AI and automated data generation tools that enable scalable creation of robot behavior datasets. However, training a policy solely in simulation and transferring it to the real world often demands substantial human effort to bridge the reality gap. A compelling alternative is to co-train the policy on a mixture of simulation and real-world datasets. Preliminary studies have recently shown this strategy to substantially improve the performance of a policy over one trained on a limited amount of real-world data. Nonetheless, the community lacks a systematic understanding of sim-and-real co-training and what it takes to reap the benefits of simulation data for real-robot learning. This work presents a simple yet effective recipe for utilizing simulation data to solve vision-based robotic manipulation tasks. We derive this recipe from comprehensive experiments that validate the co-training strategy on various simulation and real-world datasets. Using two domains--a robot arm and a humanoid--across diverse tasks, we demonstrate that simulation data can enhance real-world task performance by an average of 38%, even with notable differences between the simulation and real-world data. Videos and additional results can be found at https://co-training.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.24361v1-abstract-full').style.display = 'none'; document.getElementById('2503.24361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://co-training.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22747">arXiv:2503.22747</a> <span> [<a href="https://arxiv.org/pdf/2503.22747">pdf</a>, <a href="https://arxiv.org/format/2503.22747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> LeForecast: Enterprise Hybrid Forecast by Time Series Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zheng Tan</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Y">Yiwen Nie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenfa Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guanyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanze Liu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xinyuan Tian</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+K">Kailin Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengya Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Q">Qijiang Cheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haipeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yingzheng Ma</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wei Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuci Zhu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuanyuan Sun</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+X">Xiangyu Lei</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xiyu Guan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wanqing Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shouming Liu</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xiangquan Meng</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+P">Pengzhan Qu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chao Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiaxuan Fan</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuan He</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+H">Hongsheng Qi</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yangzhou Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22747v1-abstract-short" style="display: inline;"> Demand is spiking in industrial fields for multidisciplinary forecasting, where a broad spectrum of sectors needs planning and forecasts to streamline intelligent business management, such as demand forecasting, product planning, inventory optimization, etc. 
Specifically, these tasks expecting intelligent approaches to learn from sequentially collected historical data and then foresee most possibl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22747v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22747v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22747v1-abstract-full" style="display: none;"> Demand is spiking in industrial fields for multidisciplinary forecasting, where a broad spectrum of sectors needs planning and forecasts to streamline intelligent business management, such as demand forecasting, product planning, inventory optimization, etc. Specifically, these tasks expecting intelligent approaches to learn from sequentially collected historical data and then foresee most possible trend, i.e. time series forecasting. Challenge of it lies in interpreting complex business contexts and the efficiency and generalisation of modelling. With aspirations of pre-trained foundational models for such purpose, given their remarkable success of large foundation model across legions of tasks, we disseminate \leforecast{}, an enterprise intelligence platform tailored for time series tasks. It integrates advanced interpretations of time series data and multi-source information, and a three-pillar modelling engine combining a large foundation model (Le-TSFM), multimodal model and hybrid model to derive insights, predict or infer futures, and then drive optimisation across multiple sectors in enterprise operations. The framework is composed by a model pool, model profiling module, and two different fusion approaches regarding original model architectures. Experimental results verify the efficiency of our trail fusion concepts: router-based fusion network and coordination of large and small models, resulting in high costs for redundant development and maintenance of models. This work reviews deployment of LeForecast and its performance in three industrial use cases. Our comprehensive experiments indicate that LeForecast is a profound and practical platform for efficient and competitive performance. And we do hope that this work can enlighten the research and grounding of time series techniques in accelerating enterprise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22747v1-abstract-full').style.display = 'none'; document.getElementById('2503.22747v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
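One of LeForecast's two fusion approaches is a router-based fusion network. A toy sketch of the idea under stated assumptions: a gate scores each forecaster in the model pool and blends their predictions. The inverse-error router heuristic and the two stub models below are hypothetical stand-ins for learned components:

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical pool of forecasters: each maps a history window to a forecast.
def naive_last(history, horizon):      # repeat the last observed value
    return np.full(horizon, history[-1])

def moving_average(history, horizon):  # repeat the mean of the window
    return np.full(horizon, history.mean())

model_pool = [naive_last, moving_average]

def router_weights(history):
    """Toy router: weight models by inverse one-step backtest error.
    A learned router network would replace this heuristic."""
    errors = []
    for model in model_pool:
        pred = model(history[:-1], 1)[0]
        errors.append(abs(pred - history[-1]) + 1e-8)
    inv = 1.0 / np.array(errors)
    return inv / inv.sum()

def fused_forecast(history, horizon):
    w = router_weights(history)
    preds = np.stack([m(history, horizon) for m in model_pool])
    return w @ preds  # weighted combination of the pool's forecasts

history = rng.normal(100, 5, size=48)
print(fused_forecast(history, horizon=4))
```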
3. arXiv:2503.22405 [pdf, other] cs.CV
   Modeling Multiple Normal Action Representations for Error Detection in Procedural Tasks
   Authors: Wei-Jin Huang, Yuan-Ming Li, Zhi-Wei Xia, Yu-Ming Tang, Kun-Yu Lin, Jian-Fang Hu, Wei-Shi Zheng
   Abstract: Error detection in procedural activities is essential for consistent and correct outcomes in AR-assisted and robotic systems. Existing methods often focus on temporal ordering errors or rely on static prototypes to represent normal actions. However, these approaches typically overlook the common scenario where multiple, distinct actions are valid following a given sequence of executed actions. This leads to two issues: (1) the model cannot effectively detect errors using static prototypes when the inference environment or action execution distribution differs from training; and (2) the model may also use the wrong prototypes to detect errors if the ongoing action label is not the same as the predicted one. To address this problem, we propose an Adaptive Multiple Normal Action Representation (AMNAR) framework. AMNAR predicts all valid next actions and reconstructs their corresponding normal action representations, which are compared against the ongoing action to detect errors. Extensive experiments demonstrate that AMNAR achieves state-of-the-art performance, highlighting the effectiveness of AMNAR and the importance of modeling multiple valid next actions in error detection. The code is available at https://github.com/iSEE-Laboratory/AMNAR.
   Submitted 28 March, 2025; originally announced March 2025.
4. arXiv:2503.22359 [pdf, other] cs.CV
   Mitigating Knowledge Discrepancies among Multiple Datasets for Task-agnostic Unified Face Alignment
   Authors: Jiahao Xia, Min Xu, Wenjian Huang, Jianguo Zhang, Haimin Zhang, Chunxia Xiao
   Abstract: Despite the similar structures of human faces, existing face alignment methods cannot learn unified knowledge from multiple datasets with different landmark annotations. The limited training samples in a single dataset commonly result in fragile robustness in this field. To mitigate knowledge discrepancies among different datasets and train a task-agnostic unified face alignment (TUFA) framework, this paper presents a strategy to unify knowledge from multiple datasets. Specifically, we calculate a mean face shape for each dataset. To explicitly align these mean shapes on an interpretable plane based on their semantics, each shape is then incorporated with a group of semantic alignment embeddings. The 2D coordinates of these aligned shapes can be viewed as the anchors of the plane. By encoding them into structure prompts and further regressing the corresponding facial landmarks using image features, a mapping from the plane to the target faces is finally established, which unifies the learning target of different datasets. Consequently, multiple datasets can be utilized to boost the generalization ability of the model. The successful mitigation of discrepancies also enhances the efficiency of knowledge transfer to a novel dataset and significantly boosts the performance of few-shot face alignment. Additionally, the interpretable plane endows TUFA with a task-agnostic characteristic, enabling it to locate landmarks unseen during training in a zero-shot manner. Extensive experiments are carried out on seven benchmarks, and the results demonstrate an impressive improvement in face alignment brought by mitigating knowledge discrepancies.
   Submitted 28 March, 2025; originally announced March 2025.
   Comments: 24 Pages, 9 Figures
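The first step the abstract describes, computing a per-dataset mean face shape whose 2D coordinates act as anchors on the shared plane, is straightforward. A sketch with random stand-in landmark data; the semantic alignment embeddings and the prompt-based regression are omitted:

```python
import numpy as np

rng = np.random.default_rng(1)

# Two hypothetical datasets with different landmark conventions (e.g., 68 vs 98 points).
dataset_a = rng.normal(size=(200, 68, 2))   # 200 faces, 68 landmarks each
dataset_b = rng.normal(size=(150, 98, 2))   # 150 faces, 98 landmarks each

def mean_face_shape(landmarks):
    """Average each landmark's 2D position over the dataset; the resulting
    mean shape's coordinates act as anchors on the shared plane."""
    return landmarks.mean(axis=0)

anchors_a = mean_face_shape(dataset_a)  # (68, 2) anchor coordinates
anchors_b = mean_face_shape(dataset_b)  # (98, 2) anchor coordinates
print(anchors_a.shape, anchors_b.shape)
```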
5. arXiv:2503.22050 [pdf] cs.CV
   A Deep Learning Framework for Boundary-Aware Semantic Segmentation
   Authors: Tai An, Weiqiang Huang, Da Xu, Qingyuan He, Jiacheng Hu, Yujia Lou
   Abstract: As a fundamental task in computer vision, semantic segmentation is widely applied in fields such as autonomous driving, remote sensing image analysis, and medical image processing. In recent years, Transformer-based segmentation methods have demonstrated strong performance in global feature modeling. However, they still struggle with blurred target boundaries and insufficient recognition of small targets. To address these issues, this study proposes a Mask2Former-based semantic segmentation algorithm incorporating a boundary enhancement feature bridging module (BEFBM). The goal is to improve target boundary accuracy and segmentation consistency. Built upon the Mask2Former framework, this method constructs a boundary-aware feature map and introduces a feature bridging mechanism. This enables effective cross-scale feature fusion, enhancing the model's ability to focus on target boundaries. Experiments on the Cityscapes dataset demonstrate that, compared to mainstream segmentation methods, the proposed approach achieves significant improvements in metrics such as mIOU, mDICE, and mRecall. It also exhibits superior boundary retention in complex scenes. Visual analysis further confirms the model's advantages in fine-grained regions. Future research will focus on optimizing computational efficiency and exploring its potential in other high-precision segmentation tasks.
   Submitted 27 March, 2025; originally announced March 2025.
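A boundary-aware method needs a notion of where boundaries are. One plausible formulation, sketched here as an assumption rather than the paper's exact module: derive a boundary map from a label mask by marking pixels whose 4-neighbourhood contains a different class label.

```python
import numpy as np

def boundary_map(label_mask):
    """Mark pixels whose 4-neighbourhood contains a different class label,
    a simple way to build the boundary target a boundary-aware head trains on."""
    m = label_mask
    boundary = np.zeros_like(m, dtype=bool)
    boundary[:-1, :] |= m[:-1, :] != m[1:, :]   # compare with pixel below
    boundary[1:, :]  |= m[1:, :]  != m[:-1, :]  # compare with pixel above
    boundary[:, :-1] |= m[:, :-1] != m[:, 1:]   # compare with pixel to the right
    boundary[:, 1:]  |= m[:, 1:]  != m[:, :-1]  # compare with pixel to the left
    return boundary

mask = np.zeros((6, 6), dtype=int)
mask[2:4, 2:5] = 1  # a small rectangular object
print(boundary_map(mask).astype(int))
```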
6. arXiv:2503.20825 [pdf, other] stat.ML, cs.AI
   Debiasing Kernel-Based Generative Models
   Authors: Tian Qin, Wei-Min Huang
   Abstract: We propose a novel two-stage framework of generative models named Debiasing Kernel-Based Generative Models (DKGM) with insights from kernel density estimation (KDE) and stochastic approximation. In the first stage of DKGM, we employ KDE to bypass the obstacles in estimating the density of data without losing too much image quality. One characteristic of KDE is oversmoothing, which makes the generated images blurry. Therefore, in the second stage, we formulate the process of reducing the blurriness of images as a statistical debiasing problem and develop a novel iterative algorithm, inspired by stochastic approximation, to improve image quality. Extensive experiments illustrate that the image quality of DKGM on CIFAR10 is comparable to state-of-the-art models such as diffusion models and GAN models. The performance of DKGM on CelebA 128x128 and LSUN (Church) 128x128 is also competitive. We conduct extra experiments to explore how the bandwidth in KDE affects the sample diversity and debiasing effect of DKGM. The connections between DKGM and score-based models are also discussed.
   Submitted 25 March, 2025; originally announced March 2025.
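The first stage rests on a standard property of kernel density estimation: sampling from a Gaussian KDE amounts to picking a training point uniformly and perturbing it with bandwidth-scaled noise, and larger bandwidths oversmooth, which is the blurriness the second (debiasing) stage then removes. A minimal sketch with toy 2-D data:

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(loc=0.0, scale=1.0, size=(1000, 2))  # stand-in "dataset"

def sample_from_kde(data, bandwidth, n_samples):
    """Sample from a Gaussian KDE: pick a training point uniformly, then
    perturb it with N(0, bandwidth^2) noise. Larger bandwidths oversmooth."""
    idx = rng.integers(0, len(data), size=n_samples)
    noise = rng.normal(scale=bandwidth, size=(n_samples, data.shape[1]))
    return data[idx] + noise

samples = sample_from_kde(data, bandwidth=0.3, n_samples=5)
print(samples)
```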
7. arXiv:2503.20527 [pdf, other] cs.CL, cs.AI
   StableToolBench-MirrorAPI: Modeling Tool Environments as Mirrors of 7,000+ Real-World APIs
   Authors: Zhicheng Guo, Sijie Cheng, Yuchen Niu, Hao Wang, Sicheng Zhou, Wenbing Huang, Yang Liu
   Abstract: The rapid advancement of large language models (LLMs) has spurred significant interest in tool learning, where LLMs are augmented with external tools to tackle complex tasks. However, existing tool environments face challenges in balancing stability, scalability, and realness, particularly for benchmarking purposes. To address this problem, we propose MirrorAPI, a novel framework that trains specialized LLMs to accurately simulate real API responses, effectively acting as "mirrors" to tool environments. Using a comprehensive dataset of request-response pairs from 7,000+ APIs, we employ supervised fine-tuning and chain-of-thought reasoning to enhance simulation fidelity. MirrorAPI achieves superior accuracy and stability compared to state-of-the-art methods, as demonstrated by its performance on the newly constructed MirrorAPI-Bench and its integration into StableToolBench.
   Submitted 26 March, 2025; originally announced March 2025.
8. arXiv:2503.20315 [pdf, other] cs.CV
   SpikeDerain: Unveiling Clear Videos from Rainy Sequences Using Color Spike Streams
   Authors: Hanwen Liang, Xian Zhong, Wenxuan Liu, Yajing Zheng, Wenxin Huang, Zhaofei Yu, Tiejun Huang
   Abstract: Restoring clear frames from rainy videos presents a significant challenge due to the rapid motion of rain streaks. Traditional frame-based visual sensors, which capture scene content synchronously, struggle to capture the fast-moving details of rain accurately. In recent years, neuromorphic sensors have introduced a new paradigm for dynamic scene perception, offering microsecond temporal resolution and high dynamic range. However, existing multimodal methods that fuse event streams with RGB images face difficulties in handling the complex spatiotemporal interference of raindrops in real scenes, primarily due to hardware synchronization errors and computational redundancy. In this paper, we propose a Color Spike Stream Deraining Network (SpikeDerain), capable of reconstructing spike streams of dynamic scenes and accurately removing rain streaks. To address the challenges of data scarcity in real continuous rainfall scenes, we design a physically interpretable rain streak synthesis model that generates parameterized continuous rain patterns based on arbitrary background images. Experimental results demonstrate that the network, trained with this synthetic data, remains highly robust even under extreme rainfall conditions. These findings highlight the effectiveness and robustness of our method across varying rainfall levels and datasets, setting new standards for video deraining tasks. The code will be released soon.
   Submitted 26 March, 2025; originally announced March 2025.
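A parameterized rain streak synthesis model, in its simplest form, rasterizes straight streaks with a shared falling angle, length, and intensity onto an arbitrary background. A toy sketch of that idea; the paper's physically interpretable model is far richer:

```python
import numpy as np

rng = np.random.default_rng(0)

def add_rain_streaks(image, n_streaks=100, length=9, angle_deg=75.0, intensity=0.6):
    """Composite straight rain streaks with a shared falling angle onto a
    grayscale background; angle, length, and intensity parameterize the rain."""
    h, w = image.shape
    out = image.copy()
    dy = np.sin(np.deg2rad(angle_deg))
    dx = np.cos(np.deg2rad(angle_deg))
    for _ in range(n_streaks):
        y, x = rng.uniform(0, h), rng.uniform(0, w)
        for t in range(length):  # rasterize one streak point by point
            yi, xi = int(y + t * dy), int(x + t * dx)
            if 0 <= yi < h and 0 <= xi < w:
                out[yi, xi] = min(1.0, out[yi, xi] + intensity)
    return out

background = np.zeros((64, 64))
rainy = add_rain_streaks(background)
print(f"{(rainy > 0).sum()} rain pixels added")
```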
9. arXiv:2503.19300 [pdf, other] cs.LG, q-bio.BM
   UniMoMo: Unified Generative Modeling of 3D Molecules for De Novo Binder Design
   Authors: Xiangzhe Kong, Zishen Zhang, Ziting Zhang, Rui Jiao, Jianzhu Ma, Kai Liu, Wenbing Huang, Yang Liu
   Abstract: The design of target-specific molecules such as small molecules, peptides, and antibodies is vital for biological research and drug discovery. Existing generative methods are restricted to single-domain molecules, failing to address versatile therapeutic needs or utilize cross-domain transferability to enhance model performance. In this paper, we introduce Unified generative Modeling of 3D Molecules (UniMoMo), the first framework capable of designing binders of multiple molecular domains using a single model. In particular, UniMoMo unifies the representations of different molecules as graphs of blocks, where each block corresponds to either a standard amino acid or a molecular fragment. Based on these unified representations, UniMoMo utilizes a geometric latent diffusion model for 3D molecular generation, featuring an iterative full-atom autoencoder to compress blocks into latent space points, followed by an E(3)-equivariant diffusion process. Extensive benchmarks across peptides, antibodies, and small molecules demonstrate the superiority of our unified framework over existing domain-specific models, highlighting the benefits of multi-domain training.
   Submitted 24 March, 2025; originally announced March 2025.
   Comments: preprint
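The unifying representation is a graph of blocks, where each block is a standard amino acid or a molecular fragment with full-atom 3D coordinates. A minimal sketch of such a block structure; the class layout is a hypothetical reading of the abstract, not the released code:

```python
from dataclasses import dataclass
import numpy as np

@dataclass
class Block:
    """One node of the unified graph: either a standard amino acid
    (for peptides/antibodies) or a molecular fragment (for small molecules)."""
    block_type: str          # e.g. "ALA" or "benzene_fragment"
    atom_coords: np.ndarray  # (n_atoms, 3) full-atom 3D coordinates

    def centroid(self):
        # A coarse per-block position, e.g. for building block-level edges.
        return self.atom_coords.mean(axis=0)

molecule = [
    Block("ALA", np.random.rand(5, 3)),
    Block("benzene_fragment", np.random.rand(6, 3)),
]
print([b.centroid().round(2) for b in molecule])
```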
10. arXiv:2503.18486 [pdf, other] cs.SD, eess.AS
    Music Similarity Representation Learning Focusing on Individual Instruments with Source Separation and Human Preference
    Authors: Takehiro Imamura, Yuka Hashizume, Wen-Chin Huang, Tomoki Toda
    Abstract: This paper proposes music similarity representation learning (MSRL) based on individual instrument sounds (InMSRL) utilizing music source separation (MSS) and human preference without requiring clean instrument sounds during inference. We propose three methods that effectively improve performance. First, we introduce end-to-end fine-tuning (E2E-FT) for the Cascade approach that sequentially performs MSS and music similarity feature extraction. E2E-FT allows the model to minimize the adverse effects of a separation error on the feature extraction. Second, we propose multi-task learning for the Direct approach that directly extracts disentangled music similarity features using a single music similarity feature extractor. Multi-task learning, which combines disentangled music similarity feature extraction with MSS based on reconstruction from the disentangled features, further enhances instrument feature disentanglement. Third, we employ perception-aware fine-tuning (PAFT). PAFT utilizes human preference, allowing the model to perform InMSRL aligned with human perceptual similarity. We conduct experimental evaluations and demonstrate that 1) E2E-FT for Cascade significantly improves InMSRL performance, 2) multi-task learning for Direct also helps improve disentanglement performance in feature extraction, 3) PAFT significantly enhances perceptual InMSRL performance, and 4) Cascade with E2E-FT and PAFT outperforms Direct with multi-task learning and PAFT.
    Submitted 24 March, 2025; originally announced March 2025.
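The Cascade approach sequentially applies music source separation and per-instrument feature extraction, then compares embeddings per instrument. A skeletal sketch with placeholder components; the real separator and extractor are neural models, fine-tuned end-to-end in E2E-FT:

```python
import numpy as np

rng = np.random.default_rng(0)
INSTRUMENTS = ["vocals", "drums", "bass", "other"]

def separate_sources(mixture):
    """Placeholder for a music source separation model (one stem per
    instrument); here it just splits the signal arbitrarily."""
    return {name: mixture / len(INSTRUMENTS) for name in INSTRUMENTS}

def embed(stem):
    """Placeholder similarity-feature extractor; a real one would be a
    neural encoder fine-tuned jointly with the separator (E2E-FT)."""
    return np.array([stem.mean(), stem.std()])

def instrument_similarity(mix_a, mix_b, instrument):
    # Cascade: separate first, then embed the chosen stem, then compare.
    ea = embed(separate_sources(mix_a)[instrument])
    eb = embed(separate_sources(mix_b)[instrument])
    return ea @ eb / (np.linalg.norm(ea) * np.linalg.norm(eb) + 1e-8)

a, b = rng.normal(size=44100), rng.normal(size=44100)
print(instrument_similarity(a, b, "drums"))
```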
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18245">arXiv:2503.18245</a> <span> [<a href="https://arxiv.org/pdf/2503.18245">pdf</a>, <a href="https://arxiv.org/format/2503.18245">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DiffGED: Computing Graph Edit Distance via Diffusion-based Graph Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanchen Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+D">Dong Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xuemin Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18245v1-abstract-short" style="display: inline;"> The Graph Edit Distance (GED) problem, which aims to compute the minimum number of edit operations required to transform one graph into another, is a fundamental challenge in graph analysis with wide-ranging applications. However, due to its NP-hard nature, traditional A* approaches often suffer from scalability issue, making them computationally intractable for large graphs. Many recent deep lear… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18245v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18245v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18245v1-abstract-full" style="display: none;"> The Graph Edit Distance (GED) problem, which aims to compute the minimum number of edit operations required to transform one graph into another, is a fundamental challenge in graph analysis with wide-ranging applications. However, due to its NP-hard nature, traditional A* approaches often suffer from scalability issue, making them computationally intractable for large graphs. Many recent deep learning frameworks address GED by formulating it as a regression task, which, while efficient, fails to recover the edit path -- a central interest in GED. Furthermore, recent hybrid approaches that combine deep learning with traditional methods to recover the edit path often yield poor solution quality. These methods also struggle to generate candidate solutions in parallel, resulting in increased running times.In this paper, we present a novel approach, DiffGED, that leverages generative diffusion model to solve GED and recover the corresponding edit path. Specifically, we first generate multiple diverse node matching matrices in parallel through a diffusion-based graph matching model. Next, node mappings are extracted from each generated matching matrices in parallel, and each extracted node mapping can be simply transformed into an edit path. Benefiting from the generative diversity provided by the diffusion model, DiffGED is less likely to fall into local sub-optimal solutions, thereby achieving superior overall solution quality close to the exact solution. 
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17407">arXiv:2503.17407</a> <span> [<a href="https://arxiv.org/pdf/2503.17407">pdf</a>, <a href="https://arxiv.org/format/2503.17407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> A Comprehensive Survey on Long Context Language Modeling </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dawei Zhu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zhiqi Bai</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yancheng He</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+H">Huanxuan Liao</a>, <a href="/search/cs?searchtype=author&query=Que%2C+H">Haoran Que</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiebin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Yong Shan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jiayi Tian</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenhao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhejian Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Ruijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Junlan Feng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shizhu He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhoujun Li</a> , et al. (12 additional authors not shown) </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.17407v1-abstract-full" style="display: inline;"> Efficient processing of long contexts has been a persistent pursuit in Natural Language Processing.
With the growing number of long documents, dialogues, and other textual data, it is important to develop Long Context Language Models (LCLMs) that can process and analyze extensive inputs in an effective and efficient way. In this paper, we present a comprehensive survey on recent advances in long-context modeling for large language models. Our survey is structured around three key aspects: how to obtain effective and efficient LCLMs, how to train and deploy LCLMs efficiently, and how to evaluate and analyze LCLMs comprehensively. For the first aspect, we discuss data strategies, architectural designs, and workflow approaches oriented toward long-context processing. For the second aspect, we provide a detailed examination of the infrastructure required for LCLM training and inference. For the third aspect, we present evaluation paradigms for long-context comprehension and long-form generation, as well as behavioral analysis and mechanism interpretability of LCLMs. Beyond these three key aspects, we thoroughly explore the diverse application scenarios where existing LCLMs have been deployed and outline promising future development directions. This survey provides an up-to-date review of the literature on long-context LLMs, which we hope will serve as a valuable resource for both researchers and engineers. An associated GitHub repository collecting the latest papers and repos is available at: https://github.com/LCLM-Horizon/A-Comprehensive-Survey-For-Long-Context-Language-Modeling. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17398">arXiv:2503.17398</a> <span> [<a href="https://arxiv.org/pdf/2503.17398">pdf</a>, <a href="https://arxiv.org/format/2503.17398">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div>
<p class="title is-5 mathjax"> Reachable Sets-based Trajectory Planning Combining Reinforcement Learning and iLQR </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenjie Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shijie Yuan</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+J">Jingjia Teng</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Hongmao Qin</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+Y">Yougang Bian</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.17398v1-abstract-full" style="display: inline;"> The driving risk field is applicable to more complex driving scenarios, providing new approaches for safety decision-making and active vehicle control in intricate environments. However, existing research often overlooks the driving risk field and fails to consider the impact of risk distribution within drivable areas on trajectory planning, which poses challenges for enhancing safety. This paper proposes a trajectory planning method for intelligent vehicles based on the risk reachable set to further improve the safety of trajectory planning. First, we construct the reachable set incorporating the driving risk field to more accurately assess and avoid potential risks in drivable areas. Then, the initial trajectory is generated based on safe reinforcement learning and projected onto the reachable set. Finally, we introduce a trajectory planning method based on a constrained iterative linear quadratic regulator to optimize the initial solution, ensuring that the planned trajectory achieves optimal comfort, safety, and efficiency. We conduct simulation tests of trajectory planning in high-speed lane-changing scenarios. The results indicate that the proposed method can guarantee trajectory comfort and driving efficiency, with the generated trajectory situated outside high-risk boundaries, thereby ensuring vehicle safety during operation. </span> </p>
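<p class="is-size-7">As a loose, illustrative reading of projecting planned waypoints onto a risk-bounded reachable set (the risk field, bounds, and thresholds below are invented placeholders rather than the paper's formulation):</p>
<pre><code class="language-python">
import numpy as np

def risk(p):
    # Invented Gaussian risk field centred on an obstacle at (5, 0).
    return np.exp(-np.sum((p - np.array([5.0, 0.0])) ** 2) / 4.0)

def project_to_reachable(p, lo, hi, risk_cap=0.5):
    p = np.clip(p, lo, hi)        # stay inside the kinematic box
    while risk(p) > risk_cap:     # push laterally until risk is acceptable
        p = p + np.array([0.0, 0.1])
    return p

lo, hi = np.array([0.0, -3.0]), np.array([10.0, 3.0])
waypoints = np.array([[4.0, 0.2], [5.0, 0.1], [6.0, 0.3]])
safe = np.array([project_to_reachable(p, lo, hi) for p in waypoints])
print(safe)  # each waypoint now sits outside the high-risk region
</code></pre>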
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16843">arXiv:2503.16843</a> <span> [<a href="https://arxiv.org/pdf/2503.16843">pdf</a>, <a href="https://arxiv.org/format/2503.16843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> LoRASculpt: Sculpting LoRA for Harmonizing General and Specialized Knowledge in Multimodal Large Language Models </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jian Liang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenke Huang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+G">Guancheng Wan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qu Yang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+M">Mang Ye</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.16843v1-abstract-full" style="display: inline;"> While Multimodal Large Language Models (MLLMs) excel at generalizing across modalities and tasks, effectively adapting them to specific downstream tasks while simultaneously retaining both general and specialized knowledge remains challenging. Although Low-Rank Adaptation (LoRA) is widely used to efficiently acquire specialized knowledge in MLLMs, it introduces substantial harmful redundancy during visual instruction tuning, which exacerbates the forgetting of general knowledge and degrades downstream task performance. To address this issue, we propose LoRASculpt to eliminate harmful redundant parameters, thereby harmonizing general and specialized knowledge. Specifically, under theoretical guarantees, we introduce sparse updates into LoRA to discard redundant parameters effectively. Furthermore, we propose a Conflict Mitigation Regularizer to refine the update trajectory of LoRA, mitigating knowledge conflicts with the pretrained weights. Extensive experimental results demonstrate that even at a very high degree of sparsity ($\le$ 5%), our method simultaneously enhances generalization and downstream task performance. This confirms that our approach effectively mitigates the catastrophic forgetting issue and further promotes knowledge harmonization in MLLMs. </span> </p>
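<p class="is-size-7">The abstract does not specify the sparsification rule, so the sketch below uses plain magnitude-based pruning of a merged LoRA update as a generic stand-in for "introducing sparse updates into LoRA":</p>
<pre><code class="language-python">
import torch

torch.manual_seed(0)
d, r = 512, 8
A = torch.randn(d, r) * 0.02   # LoRA down-projection
B = torch.randn(r, d) * 0.02   # LoRA up-projection
delta = A @ B                  # merged low-rank update, shape (d, d)

# Keep only the largest 5% of entries by magnitude -- a generic proxy for
# discarding "redundant" LoRA parameters (the paper's criterion may differ).
k = int(0.05 * delta.numel())
threshold = delta.abs().flatten().kthvalue(delta.numel() - k).values
mask = delta.abs() >= threshold
sparse_delta = delta * mask

W = torch.randn(d, d)          # pretrained weight
W_adapted = W + sparse_delta   # merge the sparsified update
print(f"kept {mask.float().mean().item():.1%} of update entries")
</code></pre>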
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2025</span> </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16451">arXiv:2503.16451</a> <span> [<a href="https://arxiv.org/pdf/2503.16451">pdf</a>, <a href="https://arxiv.org/format/2503.16451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div>
<p class="title is-5 mathjax"> Think-Then-React: Towards Unconstrained Human Action-to-Reaction Generation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+W">Wenhui Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyuan Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chuhao Jin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenbing Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiting Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+R">Ruihua Song</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.16451v1-abstract-full" style="display: inline;"> Modeling human-like action-to-reaction generation has significant real-world applications, like human-robot interaction and games. Despite recent advancements in single-person motion generation, it remains challenging to handle action-to-reaction generation well, due to the difficulty of directly predicting a reaction from an action sequence without prompts, and the absence of a unified representation that effectively encodes multi-person motion. To address these challenges, we introduce Think-Then-React (TTR), a large language-model-based framework designed to generate human-like reactions. First, with our fine-grained multimodal training strategy, TTR is capable of unifying two processes during inference: a thinking process that explicitly infers action intentions and reasons out a corresponding reaction description, which serves as a semantic prompt, and a reacting process that predicts reactions based on the input action and the inferred semantic prompt. Second, to effectively represent multi-person motion in language models, we propose a unified motion tokenizer by decoupling egocentric pose and absolute space features, which effectively represents action and reaction motion with the same encoding. Extensive experiments demonstrate that TTR outperforms existing baselines, achieving significant improvements in evaluation metrics, such as reducing FID from 3.988 to 1.942. </span> </p>
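<p class="is-size-7">The FID numbers quoted here follow the standard Fréchet Inception Distance between Gaussian fits of two feature sets; a self-contained reference implementation of that metric (not TTR-specific code):</p>
<pre><code class="language-python">
import numpy as np
from scipy.linalg import sqrtm

def fid(feats_a, feats_b):
    # FID = |mu_a - mu_b|^2 + Tr(C_a + C_b - 2 (C_a C_b)^(1/2))
    mu_a, mu_b = feats_a.mean(0), feats_b.mean(0)
    c_a = np.cov(feats_a, rowvar=False)
    c_b = np.cov(feats_b, rowvar=False)
    covmean = sqrtm(c_a @ c_b)
    if np.iscomplexobj(covmean):   # tiny imaginary parts are numerical noise
        covmean = covmean.real
    return float(np.sum((mu_a - mu_b) ** 2) + np.trace(c_a + c_b - 2.0 * covmean))

rng = np.random.default_rng(0)
real = rng.normal(size=(200, 16))      # features of ground-truth reactions
gen = rng.normal(0.3, 1.0, (200, 16))  # features of generated reactions
print(fid(real, gen))  # grows as the two feature distributions diverge
</code></pre>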
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15870">arXiv:2503.15870</a> <span> [<a href="https://arxiv.org/pdf/2503.15870">pdf</a>, <a href="https://arxiv.org/format/2503.15870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> FedSAF: A Federated Learning Framework for Enhanced Gastric Cancer Detection and Privacy Preservation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yuxin Miao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Hongda Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yichun Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yishu Hong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xiechen Guo</a>, <a href="/search/cs?searchtype=author&query=Braytee%2C+A">Ali Braytee</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weidong Huang</a>, <a href="/search/cs?searchtype=author&query=Anaissi%2C+A">Ali Anaissi</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.15870v1-abstract-full" style="display: inline;"> Gastric cancer is one of the most commonly diagnosed cancers and has a high mortality rate.
Due to limited medical resources, developing machine learning models for gastric cancer recognition provides an efficient solution for medical institutions. However, such models typically require large sample sizes for training and testing, which can challenge patient privacy. Federated learning offers an effective alternative by enabling model training across multiple institutions without sharing sensitive patient data. This paper addresses the limited sample size of publicly available gastric cancer data with a modified data processing method and introduces FedSAF, a novel federated learning algorithm designed to improve the performance of existing methods, particularly in non-independent and identically distributed (non-IID) data scenarios. FedSAF incorporates attention-based message passing and the Fisher Information Matrix to enhance model accuracy, while a model splitting function reduces computation and transmission costs. Hyperparameter tuning and ablation studies demonstrate the effectiveness of this new algorithm, showing improvements in test accuracy on gastric cancer datasets, with FedSAF outperforming existing federated learning methods like FedAMP, FedAvg, and FedProx. The framework's robustness and generalization ability were further validated across additional datasets (SEED, BOT, FashionMNIST, and CIFAR-10), achieving high performance in diverse environments. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
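<p class="is-size-7">FedSAF's own components (attention-based message passing, Fisher information, model splitting) are not spelled out in the abstract; for orientation, the sketch below shows the FedAvg aggregation rule that it and the other cited baselines build on:</p>
<pre><code class="language-python">
import torch

def fedavg(client_states, client_sizes):
    """Weighted parameter averaging (FedAvg): the baseline aggregation rule
    that personalized methods such as FedSAF, FedAMP, and FedProx build on."""
    total = float(sum(client_sizes))
    return {
        key: sum(state[key] * (n / total) for state, n in zip(client_states, client_sizes))
        for key in client_states[0]
    }

# Toy example: three hospitals with different amounts of local data.
clients = [{"w": torch.full((2, 2), float(i))} for i in range(3)]
global_state = fedavg(clients, client_sizes=[100, 50, 50])
print(global_state["w"][0, 0])  # tensor(0.7500): (0*100 + 1*50 + 2*50) / 200
</code></pre>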
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15126">arXiv:2503.15126</a> <span> [<a href="https://arxiv.org/pdf/2503.15126">pdf</a>, <a href="https://arxiv.org/format/2503.15126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> Text-Derived Relational Graph-Enhanced Network for Skeleton-Based Action Segmentation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+H">Haoyu Ji</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bowen Chen</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+W">Weihong Ren</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenze Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhihao Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Honghai Liu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.15126v1-abstract-full" style="display: inline;"> Skeleton-based Temporal Action Segmentation (STAS) aims to segment and recognize various actions from long, untrimmed sequences of human skeletal movements. Current STAS methods typically employ spatio-temporal modeling to establish dependencies among joints as well as frames, and utilize one-hot encoding with cross-entropy loss for frame-wise classification supervision. However, these methods overlook the intrinsic correlations among joints and actions within skeletal features, leading to a limited understanding of human movements. To address this, we propose a Text-Derived Relational Graph-Enhanced Network (TRG-Net) that leverages prior graphs generated by Large Language Models (LLM) to enhance both modeling and supervision. For modeling, the Dynamic Spatio-Temporal Fusion Modeling (DSFM) method incorporates Text-Derived Joint Graphs (TJG) with channel- and frame-level dynamic adaptation to effectively model spatial relations, while integrating spatio-temporal core features during temporal modeling.
For supervision, the Absolute-Relative Inter-Class Supervision (ARIS) method employs contrastive learning between action features and text embeddings to regularize the absolute class distributions, and utilizes Text-Derived Action Graphs (TAG) to capture the relative inter-class relationships among action features. Additionally, we propose a Spatial-Aware Enhancement Processing (SAEP) method, which incorporates random joint occlusion and axial rotation to enhance spatial generalization. Performance evaluations on four public datasets demonstrate that TRG-Net achieves state-of-the-art results. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14185">arXiv:2503.14185</a> <span> [<a href="https://arxiv.org/pdf/2503.14185">pdf</a>, <a href="https://arxiv.org/format/2503.14185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> AdaST: Dynamically Adapting Encoder States in the Decoder for End-to-End Speech-to-Text Translation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wuwei Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dexin Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+D">Deyi Xiong</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.14185v1-abstract-full" style="display: inline;"> In end-to-end speech translation, acoustic representations learned by the encoder are usually fixed and static from the perspective of the decoder, which is not desirable for dealing with the cross-modal and cross-lingual challenge in speech translation. In this paper, we show the benefits of varying acoustic states according to decoder hidden states and propose an adaptive speech-to-text translation model that is able to dynamically adapt acoustic states in the decoder. We concatenate the acoustic state and target word embedding sequence and feed the concatenated sequence into subsequent blocks in the decoder. In order to model the deep interaction between acoustic states and target hidden states, a speech-text mixed attention sublayer is introduced to replace the conventional cross-attention network. Experimental results on two widely used datasets show that the proposed method significantly outperforms state-of-the-art neural speech translation models. </span> </p>
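<p class="is-size-7">One plausible reading of the proposed speech-text mixed attention sublayer, sketched in PyTorch (the dimensions, and the omission of causal masking, are simplifying assumptions, not the paper's exact design):</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

d_model, n_heads = 256, 4
mixed_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

acoustic = torch.randn(2, 50, d_model)  # encoder acoustic states (B, T_a, D)
target = torch.randn(2, 12, d_model)    # target word embeddings  (B, T_t, D)

# Mixed attention: each target position attends over the concatenation of
# acoustic and target states in one sublayer, instead of separate
# self-attention plus cross-attention. (Causal masking omitted for brevity.)
memory = torch.cat([acoustic, target], dim=1)  # (B, T_a + T_t, D)
out, _ = mixed_attn(query=target, key=memory, value=memory)
print(out.shape)  # torch.Size([2, 12, 256])
</code></pre>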
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2021 Findings</span> </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13948">arXiv:2503.13948</a> <span> [<a href="https://arxiv.org/pdf/2503.13948">pdf</a>, <a href="https://arxiv.org/format/2503.13948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Light4GS: Lightweight Compact 4D Gaussian Splatting Generation via Context Model </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mufan Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qi Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenjie Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zhenlong Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhu Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yiling Xu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.13948v1-abstract-full" style="display: inline;">
3D Gaussian Splatting (3DGS) has emerged as an efficient and high-fidelity paradigm for novel view synthesis. To adapt 3DGS for dynamic content, deformable 3DGS incorporates temporally deformable primitives with learnable latent embeddings to capture complex motions. Despite its impressive performance, the high-dimensional embeddings and vast number of primitives lead to substantial storage requirements. In this paper, we introduce a Lightweight 4DGS framework, called Light4GS, that employs significance pruning with a deep context model to provide a lightweight, storage-efficient dynamic 3DGS representation. The proposed Light4GS is based on 4DGS, a typical representation of deformable 3DGS. Specifically, our framework is built upon two core components: (1) a spatio-temporal significance pruning strategy that eliminates over 64% of the deformable primitives, followed by an entropy-constrained spherical harmonics compression applied to the remainder; and (2) a deep context model that integrates intra- and inter-prediction with a hyperprior into a coarse-to-fine context structure to enable efficient multiscale latent embedding compression. Our approach achieves over 120x compression and increases rendering FPS by up to 20% compared to the baseline 4DGS, and is also superior to frame-wise state-of-the-art 3DGS compression methods, revealing the effectiveness of Light4GS in terms of both intra- and inter-prediction without sacrificing rendering quality. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
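<p class="is-size-7">A minimal sketch of significance-based pruning at the ratio the abstract reports, with invented significance scores standing in for the accumulated rendering contributions a real 4DGS pipeline would compute:</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)
n = 100_000
# Invented per-primitive significance scores; a real 4DGS pipeline would
# accumulate each primitive's contribution to rendered pixels instead.
significance = rng.gamma(shape=2.0, scale=1.0, size=n)

keep_ratio = 0.36                         # the abstract prunes over 64%
k = int(keep_ratio * n)
keep_idx = np.argsort(significance)[-k:]  # top 36% of scorers survive

mask = np.zeros(n, dtype=bool)
mask[keep_idx] = True
print(f"pruned {1.0 - mask.mean():.0%} of primitives")  # pruned 64%
</code></pre>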
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13820">arXiv:2503.13820</a> <span> [<a href="https://arxiv.org/pdf/2503.13820">pdf</a>, <a href="https://arxiv.org/format/2503.13820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div>
<p class="title is-5 mathjax"> A Preliminary Investigation into Theory-Practice Barriers in Sino-New Zealand Undergraduate Computing Education </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dai%2C+F">Fei Dai</a>, <a href="/search/cs?searchtype=author&query=Robins%2C+A">Anthony Robins</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Z">Zhihao Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wanni Huang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chiu-Pih Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianzhen Chen</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.13820v1-abstract-full" style="display: inline;"> This paper investigates the barriers hindering the effective transition from theoretical knowledge to practical application in a Sino-New Zealand double-degree undergraduate computing program. In this unique educational setting, students study at a campus in China but complete both Chinese and New Zealand courses taught jointly by lecturers from both countries. Through a questionnaire administered to these students, we identify critical obstacles such as insufficient foundational knowledge, language barriers, cultural and pedagogical differences, and difficulties adapting to distinct educational systems. Our analysis reveals that these barriers significantly affect students' academic performance, engagement, and skill development. Based on the findings, we propose targeted interventions, including specialized bridging courses, enhanced language support, refined teaching methods, and improved resource allocation. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted at CSEDU 2025</span> </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13265">arXiv:2503.13265</a> <span> [<a href="https://arxiv.org/pdf/2503.13265">pdf</a>, <a href="https://arxiv.org/format/2503.13265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> FlexWorld: Progressively Expanding 3D Scenes for Flexible-View Synthesis </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Luxi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zihan Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Min Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yikai Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenhao Huang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chongxuan Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.13265v2-abstract-full" style="display: inline;"> Generating flexible-view 3D scenes, including 360° rotation and zooming, from single images is challenging due to a lack of 3D data. To this end, we introduce FlexWorld, a novel framework consisting of two key components: (1) a strong video-to-video (V2V) diffusion model to generate high-quality novel view images from incomplete input rendered from a coarse scene, and (2) a progressive expansion process to construct a complete 3D scene. In particular, leveraging an advanced pre-trained video model and accurate depth-estimated training pairs, our V2V model can generate novel views under large camera pose variations. Building upon it, FlexWorld progressively generates new 3D content and integrates it into the global scene through geometry-aware scene fusion. Extensive experiments demonstrate the effectiveness of FlexWorld in generating high-quality novel view videos and flexible-view 3D scenes from single images, achieving superior visual quality under multiple popular metrics and datasets compared to existing state-of-the-art methods. Qualitatively, we highlight that FlexWorld can generate high-fidelity scenes with flexible views like 360° rotations and zooming. Project page: https://ml-gsai.github.io/FlexWorld. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13120">arXiv:2503.13120</a> <span> [<a href="https://arxiv.org/pdf/2503.13120">pdf</a>, <a href="https://arxiv.org/format/2503.13120">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> 3D Human Interaction Generation: A Survey </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+S">Siyuan Fan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenke Huang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xiantao Cai</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.13120v1-abstract-full" style="display: inline;"> 3D human interaction generation has emerged as a key research area, focusing on producing dynamic and contextually relevant interactions between humans and various interactive entities. Recent rapid advancements in 3D model representation methods, motion capture technologies, and generative models have laid a solid foundation for the growing interest in this domain. Existing research in this field can be broadly categorized into three areas: human-scene interaction, human-object interaction, and human-human interaction. Despite the rapid advancements in this area, challenges remain due to the need for naturalness in human motion generation and the accurate interaction between humans and interactive entities. In this survey, we present a comprehensive literature review of human interaction generation, which, to the best of our knowledge, is the first of its kind.
We begin by introducing the foundational technologies, including model representations, motion capture methods, and generative models. Subsequently, we introduce the approaches proposed for the three sub-tasks, along with their corresponding datasets and evaluation metrics. Finally, we discuss potential future research directions in this area and conclude the survey. Through this survey, we aim to offer a comprehensive overview of the current advancements in the field, highlight key challenges, and inspire future research works. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12388">arXiv:2503.12388</a> <span> [<a href="https://arxiv.org/pdf/2503.12388">pdf</a>, <a href="https://arxiv.org/format/2503.12388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> Serenade: A Singing Style Conversion Framework Based On Audio Infilling </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Violeta%2C+L+P">Lester Phillip Violeta</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.12388v1-abstract-full" style="display: inline;"> We propose Serenade, a novel framework for the singing style conversion (SSC) task. Although singer identity conversion has made great strides in recent years, converting the singing style of a singer has been an unexplored research area. We find three main challenges in SSC: modeling the target style, disentangling the source style, and retaining the source melody. To model the target singing style, we use an audio infilling task by predicting a masked segment of the target mel-spectrogram with a flow-matching model using the complement of the masked target mel-spectrogram along with disentangled acoustic features.
On the other hand, to disentangle the source singing style, we use a cyclic training approach, where we use synthetic converted samples as source inputs and reconstruct the original source mel-spectrogram as a target. Finally, to retain the source melody better, we investigate a post-processing module using a source-filter-based vocoder and resynthesize the converted waveforms using the original F0 patterns. Our results showed that the Serenade framework can handle generalized SSC tasks with the best overall similarity score, especially in modeling breathy and mixed singing styles. Moreover, although resynthesizing with the original F0 patterns alleviated out-of-tune singing and improved naturalness, we found a slight tradeoff in similarity due to not changing the F0 patterns into the target style. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint under review</span> </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11080">arXiv:2503.11080</a> <span> [<a href="https://arxiv.org/pdf/2503.11080">pdf</a>, <a href="https://arxiv.org/format/2503.11080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> Joint Training And Decoding for Multilingual End-to-End Simultaneous Speech Translation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wuwei Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+R">Renren Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+J">Jian Luan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+D">Deyi Xiong</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.11080v1-abstract-full" style="display: inline;"> Recent studies on end-to-end speech translation (ST) have facilitated the exploration of multilingual end-to-end ST and end-to-end simultaneous ST. In this paper, we investigate end-to-end simultaneous speech translation in a one-to-many multilingual setting, which is closer to real-world application scenarios.
We explore a separate decoder architecture and a unified architecture for joint synchronous training in this scenario. To further explore knowledge transfer across languages, we propose an asynchronous training strategy on the proposed unified decoder architecture. A multi-way aligned multilingual end-to-end ST dataset was curated as a benchmark testbed to evaluate our methods. Experimental results demonstrate the effectiveness of our models on the collected dataset. Our codes and data are available at: https://github.com/XiaoMi/TED-MMST. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2023</span> </p>
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10705">arXiv:2503.10705</a> <span> [<a href="https://arxiv.org/pdf/2503.10705">pdf</a>, <a href="https://arxiv.org/format/2503.10705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Enhanced Continual Learning of Vision-Language Models with Model Fusion </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+H">Haoyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zicong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yuqi Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Linglan Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guilin Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yexin Li</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Linghe Kong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weiran Huang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.10705v2-abstract-full" style="display: inline;"> Vision-Language Models (VLMs) represent a breakthrough in artificial intelligence by integrating visual and textual modalities to achieve impressive zero-shot capabilities.
However, VLMs are susceptible to catastrophic forgetting when sequentially fine-tuned on multiple downstream tasks. Existing continual learning methods for VLMs often rely heavily on additional reference datasets, compromise zero-shot performance, or are limited to parameter-efficient fine-tuning scenarios. In this paper, we propose Continual Decoupling-Unifying (ConDU), a novel approach that introduces model fusion into continual learning for VLMs. ConDU maintains a unified model along with task triggers and prototype sets, employing an iterative process of decoupling task-specific models for previous tasks and unifying them with the model for the newly learned task. Additionally, we introduce an inference strategy for zero-shot scenarios by aggregating predictions from multiple decoupled task-specific models. Extensive experiments across various settings show that ConDU achieves up to a 2% improvement in average performance across all seen tasks compared to state-of-the-art baselines, while also enhancing zero-shot capabilities relative to the original VLM. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025 workshop</span> </p>
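<p class="is-size-7">ConDU's decoupling/unifying procedure is more involved than plain weight averaging, but parameter averaging is the simplest instance of the model fusion it builds on; a toy sketch (names hypothetical):</p>
<pre><code class="language-python">
import torch

def fuse(state_dicts, weights=None):
    """Uniform (or weighted) parameter averaging across task-specific models:
    the simplest form of the model fusion that ConDU-style methods build on."""
    n = len(state_dicts)
    weights = weights or [1.0 / n] * n
    return {
        key: sum(w * sd[key] for sd, w in zip(state_dicts, weights))
        for key in state_dicts[0]
    }

# Toy example: three task-specific fine-tunes of one architecture.
tasks = [{"proj.weight": torch.full((4, 4), float(i))} for i in range(3)]
unified = fuse(tasks)
print(unified["proj.weight"][0, 0])  # tensor(1.): mean of 0, 1, 2
</code></pre>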
arXiv:2503.10471 [pdf, other] | cond-mat.mtrl-sci (Materials Science); cs.AI (Artificial Intelligence)
Siamese Foundation Models for Crystal Structure Prediction
Authors: Liming Wu, Wenbing Huang, Rui Jiao, Jianxing Huang, Liwei Liu, Yipeng Zhou, Hao Sun, Yang Liu, Fuchun Sun, Yuxiang Ren, Jirong Wen
Abstract: Crystal Structure Prediction (CSP), which aims to generate stable crystal structures from compositions, represents a critical pathway for discovering novel materials. While structure prediction tasks in other domains, such as proteins, have seen remarkable progress, CSP remains a relatively underexplored area due to the more complex geometries inherent in crystal structures. In this paper, we propose Siamese foundation models specifically designed to address CSP. Our pretrain-finetune framework, named DAO, comprises two complementary foundation models: DAO-G for structure generation and DAO-P for energy prediction. Experiments on CSP benchmarks (MP-20 and MPTS-52) demonstrate that our DAO-G significantly surpasses state-of-the-art (SOTA) methods across all metrics.
Extensive ablation studies further confirm that DAO-G excels in generating diverse polymorphic structures, and that the dataset relaxation and energy guidance provided by DAO-P are essential for enhancing DAO-G's performance. When applied to three real-world superconductors ($\text{CsV}_3\text{Sb}_5$, $\text{Zr}_{16}\text{Rh}_8\text{O}_4$ and $\text{Zr}_{16}\text{Pd}_8\text{O}_4$) that are known to be challenging to analyze, our foundation models achieve accurate critical-temperature predictions and structure generations. For instance, on $\text{CsV}_3\text{Sb}_5$, DAO-G generates a structure close to the experimental one with an RMSE of 0.0085; DAO-P predicts the $T_c$ value with high accuracy (2.26 K vs. the ground-truth value of 2.30 K). In contrast, conventional DFT calculators like Quantum Espresso only successfully derive the structure of the first superconductor within an acceptable time, while the RMSE is nearly 8 times larger and the computation is more than 1000 times slower. These compelling results collectively highlight the potential of our approach for advancing materials science research and development.
Submitted 13 March, 2025; originally announced March 2025.

arXiv:2503.09117 [pdf, other] | cs.LG (Machine Learning); cs.CL (Computation and Language)
GRU: Mitigating the Trade-off between Unlearning and Retention for Large Language Models
Authors: Yue Wang, Qizhou Wang, Feng Liu, Wei Huang, Yali Du, Xiaojiang Du, Bo Han
Abstract: Large language model (LLM) unlearning has demonstrated its essential role in removing privacy- and copyright-related responses, which is crucial for the legal and safe application of LLMs. However, the pursuit of complete unlearning often comes at a substantial cost to general functionality, leading to a notorious trade-off between unlearning and retention.
By examining the unlearning update process dynamically, we find that gradients hold essential information for revealing this trade-off. In particular, we track how retention performance varies with the directional disparity between the gradients at play during unlearning, which motivates sculpting an update mechanism from gradients of two kinds: those harmful for retention and those useful for unlearning. Accordingly, we propose Gradient Rectified Unlearning (GRU), an enhanced unlearning framework that controls the updating gradients in a geometry-focused, optimization-driven manner so that their side effects on other, unrelated responses are minimized. Specifically, GRU derives a closed-form solution that projects the unlearning gradient onto the orthogonal space of the gradient harmful for retention, ensuring minimal deviation from its original direction while overall performance is retained. Comprehensive experiments demonstrate that GRU, as a general framework, is straightforward to implement and efficiently enhances a range of baseline methods through its adaptable and compatible characteristics. Experimental results further show its broad effectiveness across a diverse set of LLM unlearning benchmarks.
Submitted 12 March, 2025; originally announced March 2025.
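The closed-form projection is simple enough to sketch. A minimal PyTorch illustration, assuming both gradients are flattened into single vectors and that conflict is detected via a negative inner product (a PCGrad-style test); the variable names and the conflict test are our assumptions, not the paper's code:

    import torch

    def rectify_unlearning_gradient(g_unlearn: torch.Tensor,
                                    g_retain: torch.Tensor,
                                    eps: float = 1e-12) -> torch.Tensor:
        # If the unlearning gradient conflicts with retention (negative inner
        # product with the retention gradient), remove the conflicting
        # component by projecting onto the orthogonal complement of g_retain.
        inner = torch.dot(g_unlearn, g_retain)
        if inner < 0:
            g_unlearn = g_unlearn - (inner / (g_retain.dot(g_retain) + eps)) * g_retain
        return g_unlearn

The rectified direction then replaces the raw unlearning gradient in the optimizer step; when the two gradients do not conflict, the update passes through unchanged.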
arXiv:2503.08864 [pdf, other] | cond-mat.soft (Soft Condensed Matter); cs.RO (Robotics)
Real-time simulation enabled navigation control of magnetic soft continuum robots in confined lumens
Authors: Dezhong Tong, Zhuonan Hao, Jiyu Li, Boxi Sun, Mingchao Liu, Liu Wang, Weicheng Huang
Abstract: Magnetic soft continuum robots (MSCRs) have emerged as a promising technology for minimally invasive interventions, offering enhanced dexterity and remote-controlled navigation in confined lumens. Unlike conventional guidewires with pre-shaped tips, MSCRs feature a magnetic tip that actively bends under applied magnetic fields. Despite extensive studies in modeling and simulation, achieving real-time navigation control of MSCRs in confined lumens remains a significant challenge. The primary reasons are robot-lumen contact interactions and the computational cost of modeling MSCR nonlinear behavior under magnetic actuation. Existing approaches, such as Finite Element Method (FEM) simulations and energy-minimization techniques, suffer from high computational costs and oversimplified contact interactions, making them impractical for real-world applications. In this work, we develop a real-time simulation and navigation control framework that integrates hard-magnetic elastic rod theory, formulated within the Discrete Differential Geometry (DDG) framework, with an order-reduced contact handling strategy. Our approach captures large deformations and complex interactions while maintaining computational efficiency.
Next, the navigation control problem is formulated as an inverse design task, where optimal magnetic fields are computed in real time by minimizing the constraint forces and enhancing navigation accuracy. We validate the proposed framework through comprehensive numerical simulations and experimental studies, demonstrating its robustness, efficiency, and accuracy. The results show that our method significantly reduces computational costs while maintaining high-fidelity modeling, making it feasible for real-time deployment in clinical settings.
Submitted 11 March, 2025; originally announced March 2025.
Comments: 22 pages, 12 figures
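The inverse-design formulation suggests a generic sketch: optimize the applied field against a cost that trades off tracking error and constraint forces. Everything below is illustrative: `simulate` is a hypothetical differentiable one-step DDG simulator, and the optimizer and cost weight are our choices, not the paper's:

    import torch

    def design_field(simulate, B_init: torch.Tensor,
                     steps: int = 50, lr: float = 1e-2,
                     w_contact: float = 0.1) -> torch.Tensor:
        # Inverse design: search for the actuating field B that tracks the
        # target tip position while keeping constraint forces small.
        B = B_init.clone().requires_grad_(True)
        opt = torch.optim.Adam([B], lr=lr)
        for _ in range(steps):
            tip_error, f_constraint = simulate(B)  # hypothetical simulator
            loss = tip_error + w_contact * f_constraint.pow(2).sum()
            opt.zero_grad()
            loss.backward()
            opt.step()
        return B.detach()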
arXiv:2503.08689 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
QuoTA: Query-oriented Token Assignment via CoT Query Decouple for Long Video Comprehension
Authors: Yongdong Luo, Wang Chen, Xiawu Zheng, Weizhong Huang, Shukang Yin, Haojia Lin, Chaoyou Fu, Jinfa Huang, Jiayi Ji, Jiebo Luo, Rongrong Ji
Abstract: Recent advances in long video understanding typically mitigate visual redundancy through visual token pruning based on attention distribution. However, while existing methods employ post-hoc low-response token pruning in decoder layers, they overlook the input-level semantic correlation between visual tokens and instructions (query). In this paper, we propose QuoTA, an ante-hoc training-free module that extends existing large video-language models (LVLMs) with visual token assignment based on query-oriented frame-level importance assessment. The query-oriented token selection is crucial as it aligns visual processing with task-specific requirements, optimizing token budget utilization while preserving semantically relevant content. Specifically, (i) QuoTA strategically allocates frame-level importance scores based on query relevance, enabling one-time visual token assignment before cross-modal interactions in decoder layers, (ii) we decouple the query through Chain-of-Thoughts reasoning to facilitate more precise LVLM-based frame importance scoring, and (iii) QuoTA offers plug-and-play functionality that extends to existing LVLMs. Extensive experimental results demonstrate that implementing QuoTA with LLaVA-Video-7B yields an average performance improvement of 3.2% across six benchmarks (including Video-MME and MLVU) while operating within an identical visual token budget as the baseline. Codes are open-sourced at https://github.com/MAC-AutoML/QuoTA.
Submitted 11 March, 2025; originally announced March 2025.
Comments: Project page: https://github.com/MAC-AutoML/QuoTA
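The budget-assignment step above reduces to turning frame-level importance scores into per-frame token counts under a fixed budget. A minimal PyTorch sketch, assuming softmax-normalized scores and proportional rounding (both our assumptions; QuoTA's exact allocation rule may differ):

    import torch

    def assign_token_budget(frame_scores: torch.Tensor,
                            total_budget: int) -> torch.Tensor:
        # Allocate a fixed visual-token budget across frames in proportion
        # to query-conditioned importance scores.
        probs = torch.softmax(frame_scores, dim=0)
        budget = (probs * total_budget).floor().long()
        # hand out tokens lost to flooring, highest-scoring frames first
        remainder = total_budget - int(budget.sum())
        if remainder > 0:
            top = torch.argsort(probs, descending=True)[:remainder]
            budget[top] += 1
        return budget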
arXiv:2503.08638 [pdf, other] | eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.MM (Multimedia); cs.SD (Sound)
YuE: Scaling Open Foundation Models for Long-Form Music Generation
Authors: Ruibin Yuan, Hanfeng Lin, Shuyue Guo, Ge Zhang, Jiahao Pan, Yongyi Zang, Haohe Liu, Yiming Liang, Wenye Ma, Xingjian Du, Xinrun Du, Zhen Ye, Tianyu Zheng, Yinghao Ma, Minghao Liu, Zeyue Tian, Ziya Zhou, Liumeng Xue, Xingwei Qu, Yizhi Li, Shangda Wu, Tianhao Shen, Ziyang Ma, Jun Zhan, Chunhui Wang, et al. (32 additional authors not shown)
Abstract: We tackle the task of long-form music generation, particularly the challenging lyrics-to-song problem, by introducing YuE, a family of open foundation models based on the LLaMA2 architecture.
Specifically, YuE scales to trillions of tokens and generates up to five minutes of music while maintaining lyrical alignment, coherent musical structure, and engaging vocal melodies with appropriate accompaniment. It achieves this through (1) track-decoupled next-token prediction to overcome dense mixture signals, (2) structural progressive conditioning for long-context lyrical alignment, and (3) a multitask, multiphase pre-training recipe to converge and generalize. In addition, we redesign the in-context learning technique for music generation, enabling versatile style transfer (e.g., converting Japanese city pop into an English rap while preserving the original accompaniment) and bidirectional generation. Through extensive evaluation, we demonstrate that YuE matches or even surpasses some of the proprietary systems in musicality and vocal agility. In addition, fine-tuning YuE enables additional controls and enhanced support for tail languages. Furthermore, beyond generation, we show that YuE's learned representations can perform well on music understanding tasks, where the results of YuE match or exceed state-of-the-art methods on the MARBLE benchmark. Keywords: lyrics2song, song generation, long-form, foundation model, music generation
Submitted 11 March, 2025; originally announced March 2025.
Comments: https://github.com/multimodal-art-projection/YuE
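Track-decoupled next-token prediction replaces one dense mixture stream with synchronized per-track token streams. A toy sketch of one plausible sequence layout, interleaving frame-aligned vocal and accompaniment codec tokens; this layout is our assumption for illustration, not YuE's documented format:

    from typing import List

    def interleave_tracks(vocal_ids: List[int],
                          accomp_ids: List[int]) -> List[int]:
        # Build one sequence an autoregressive LM can model:
        # frame t contributes its vocal token, then its accompaniment token,
        # so the model predicts each track instead of a dense mixture.
        assert len(vocal_ids) == len(accomp_ids), "tracks must be frame-aligned"
        seq: List[int] = []
        for v, a in zip(vocal_ids, accomp_ids):
            seq.extend((v, a))
        return seq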
arXiv:2503.08175 [pdf, other] | cs.AI (Artificial Intelligence)
Privacy-Enhancing Paradigms within Federated Multi-Agent Systems
Authors: Zitong Shi, Guancheng Wan, Wenke Huang, Guibin Zhang, Jiawei Shao, Mang Ye, Carl Yang
Abstract: LLM-based Multi-Agent Systems (MAS) have proven highly effective in solving complex problems by integrating multiple agents, each performing different roles. However, in sensitive domains, they face emerging privacy protection challenges. In this paper, we introduce the concept of Federated MAS, highlighting the fundamental differences between Federated MAS and traditional FL. We then identify key challenges in developing Federated MAS, including: 1) heterogeneous privacy protocols among agents, 2) structural differences in multi-party conversations, and 3) dynamic conversational network structures. To address these challenges, we propose Embedded Privacy-Enhancing Agents (EPEAgent), an innovative solution that integrates seamlessly into the Retrieval-Augmented Generation (RAG) phase and the context retrieval stage. This solution minimizes data flows, ensuring that only task-relevant, agent-specific information is shared. Additionally, we design and generate a comprehensive dataset to evaluate the proposed paradigm. Extensive experiments demonstrate that EPEAgent effectively enhances privacy protection while maintaining strong system performance. The code will be available at https://github.com/ZitongShi/EPEAgent
Submitted 11 March, 2025; originally announced March 2025.
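The data-flow-minimization step has a natural sketch: gate each retrieved chunk on task relevance and on the receiving agent's privacy protocol before it is shared. The predicates below are hypothetical placeholders, not EPEAgent's API:

    from typing import Callable, List

    def share_with_agent(chunks: List[str], agent_role: str, task: str,
                         is_relevant: Callable[[str, str], bool],
                         is_permitted: Callable[[str, str], bool]) -> List[str]:
        # Forward a retrieved chunk only if it is (a) relevant to the current
        # task and (b) allowed to reach this agent under its privacy protocol.
        return [c for c in chunks
                if is_relevant(c, task) and is_permitted(c, agent_role)]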
arXiv:2503.08069 [pdf, other] | q-bio.NC (Neurons and Cognition); cond-mat.dis-nn (Disordered Systems and Neural Networks); cs.NE (Neural and Evolutionary Computing); nlin.CD (Chaotic Dynamics)
Freezing chaos without synaptic plasticity
Authors: Weizhong Huang, Haiping Huang
Abstract: Chaos is ubiquitous in high-dimensional neural dynamics. A strong chaotic fluctuation may be harmful to information processing. A traditional way to mitigate this issue is to introduce Hebbian plasticity, which can stabilize the dynamics. Here, we introduce another, distinct way without synaptic plasticity. An Onsager reaction term due to the feedback of the neuron itself is added to the vanilla recurrent dynamics, making the driving force a gradient form. The original unstable fixed points supporting the chaotic fluctuation can then be approached by further decreasing the kinetic energy of the dynamics. We show that this freezing effect also holds in more biologically realistic networks, such as those composed of excitatory and inhibitory neurons. The gradient dynamics are also useful for computational tasks such as recalling or predicting external time-dependent stimuli.
Submitted 11 March, 2025; originally announced March 2025.
Comments: 19 pages, 9 figures
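To make "the driving force becomes a gradient form" concrete, here is one hedged reading (our notation and energy choice, not necessarily the paper's exact Onsager construction):

    % vanilla recurrent dynamics (standard rate model)
    \dot{x}_i = -x_i + \sum_{j} J_{ij}\,\phi(x_j)

    % an energy function (illustrative choice, assuming J symmetric)
    E(\mathbf{x}) = \sum_i \frac{x_i^2}{2}
                  - \frac{1}{2}\sum_{i \neq j} J_{ij}\,\phi(x_i)\,\phi(x_j)

    % gradient dynamics: the drive picks up a self-feedback factor \phi'(x_i),
    % playing the role of the Onsager-reaction correction
    \dot{x}_i = -\frac{\partial E}{\partial x_i}
              = -x_i + \phi'(x_i) \sum_{j \neq i} J_{ij}\,\phi(x_j)

Along such trajectories $\dot{E} = -\sum_i \dot{x}_i^2 \le 0$, so the dynamics descend the energy, and fixed points are reached by driving the kinetic term to zero, consistent with the abstract's "further decreasing the kinetic energy of the dynamics."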
arXiv:2503.07703 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
Seedream 2.0: A Native Chinese-English Bilingual Image Generation Foundation Model
Authors: Lixue Gong, Xiaoxia Hou, Fanshi Li, Liang Li, Xiaochen Lian, Fei Liu, Liyang Liu, Wei Liu, Wei Lu, Yichun Shi, Shiqi Sun, Yu Tian, Zhi Tian, Peng Wang, Xun Wang, Ye Wang, Guofeng Wu, Jie Wu, Xin Xia, Xuefeng Xiao, Linjie Yang, Zhonghua Zhai, Xinyu Zhang, Qi Zhang, Yuwei Zhang, et al. (3 additional authors not shown)
Abstract: Rapid advancement of diffusion models has catalyzed remarkable progress in the field of image generation.
However, prevalent models such as Flux, SD3.5 and Midjourney still grapple with issues like model bias, limited text rendering capabilities, and insufficient understanding of Chinese cultural nuances. To address these limitations, we present Seedream 2.0, a native Chinese-English bilingual image generation foundation model that excels across diverse dimensions: it adeptly manages text prompts in both Chinese and English, supporting bilingual image generation and text rendering. We develop a powerful data system that facilitates knowledge integration, and a caption system that balances accuracy and richness in image description. In particular, Seedream is integrated with a self-developed bilingual large language model as a text encoder, allowing it to learn native knowledge directly from massive data. This enables it to generate high-fidelity images with accurate cultural nuances and aesthetic expressions described in either Chinese or English. Besides, Glyph-Aligned ByT5 is applied for flexible character-level text rendering, while a Scaled ROPE generalizes well to untrained resolutions. Multi-phase post-training optimizations, including SFT and RLHF iterations, further improve the overall capability. Through extensive experimentation, we demonstrate that Seedream 2.0 achieves state-of-the-art performance across multiple aspects, including prompt-following, aesthetics, text rendering, and structural correctness. Furthermore, Seedream 2.0 has been optimized through multiple RLHF iterations to closely align its output with human preferences, as revealed by its outstanding ELO score. In addition, it can be readily adapted to an instruction-based image editing model, such as SeedEdit, with strong editing capability that balances instruction-following and image consistency.
Submitted 10 March, 2025; originally announced March 2025.
Comments: Official Page: https://team.doubao.com/tech/seedream

arXiv:2503.07487 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
LLaVA-RadZ: Can Multimodal Large Language Models Effectively Tackle Zero-shot Radiology Recognition?
Authors: Bangyan Li, Wenxuan Huang, Yunhang Shen, Yeqiang Wang, Shaohui Lin, Jingzhong Lin, Ling You, Yinqi Zhang, Ke Li, Xing Sun, Yuling Sun
Abstract: Recently, multimodal large language models (MLLMs) have demonstrated exceptional capabilities in visual understanding and reasoning across various vision-language tasks. However, MLLMs usually perform poorly in zero-shot medical disease recognition, as they do not fully exploit the captured features and available medical knowledge. To address this challenge, we propose LLaVA-RadZ, a simple yet effective framework for zero-shot medical disease recognition. Specifically, we design an end-to-end training strategy, termed Decoding-Side Feature Alignment Training (DFAT), to take advantage of the characteristics of the MLLM decoder architecture and incorporate modality-specific tokens tailored for different modalities, which effectively utilizes image and text representations and facilitates robust cross-modal alignment.
Additionally, we introduce a Domain Knowledge Anchoring Module (DKAM) to exploit the intrinsic medical knowledge of large models, which mitigates the category semantic gap in image-text alignment. DKAM improves category-level alignment, allowing for accurate disease recognition. Extensive experiments on multiple benchmarks demonstrate that LLaVA-RadZ significantly outperforms traditional MLLMs in zero-shot disease recognition and achieves state-of-the-art performance compared to well-established and highly optimized CLIP-based approaches.
Submitted 10 March, 2025; originally announced March 2025.

arXiv:2503.06866 [pdf, other] | cs.RO (Robotics); cs.AI (Artificial Intelligence)
Graphormer-Guided Task Planning: Beyond Static Rules with LLM Safety Perception
Authors: Wanjing Huang, Tongjie Pan, Yalan Ye
Abstract: Recent advancements in large language models (LLMs) have expanded their role in robotic task planning. However, while LLMs have been explored for generating feasible task sequences, their ability to ensure safe task execution remains underdeveloped. Existing methods struggle with structured risk perception, making them inadequate for safety-critical applications where low-latency hazard adaptation is required. To address this limitation, we propose a Graphormer-enhanced risk-aware task planning framework that combines LLM-based decision-making with structured safety modeling.
Our approach constructs a dynamic spatio-semantic safety graph, capturing spatial and contextual risk factors to enable online hazard detection and adaptive task refinement. Unlike existing methods that rely on predefined safety constraints, our framework introduces a context-aware risk perception module that continuously refines safety predictions based on real-time task execution. This enables a more flexible and scalable approach to robotic planning, allowing for adaptive safety compliance beyond static rules. To validate our framework, we conduct experiments in the AI2-THOR environment. The experimental results validate improvements in risk detection accuracy, safety notification, and task adaptability of our framework in continuous environments compared to static rule-based and LLM-only baselines. Our project is available at https://github.com/hwj20/GGTP
Submitted 9 March, 2025; originally announced March 2025.

arXiv:2503.06749 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning)
Vision-R1: Incentivizing Reasoning Capability in Multimodal Large Language Models
Authors: Wenxuan Huang, Bohan Jia, Zijie Zhai, Shaosheng Cao, Zheyu Ye, Fei Zhao, Zhe Xu, Yao Hu, Shaohui Lin
Abstract: DeepSeek-R1-Zero has successfully demonstrated the emergence of reasoning capabilities in LLMs purely through Reinforcement Learning (RL). Inspired by this breakthrough, we explore how RL can be utilized to enhance the reasoning capability of MLLMs.
However, direct training with RL struggles to activate complex reasoning capabilities such as questioning and reflection in MLLMs, due to the absence of substantial high-quality multimodal reasoning data. To address this issue, we propose Vision-R1, a reasoning MLLM, to improve multimodal reasoning capability. Specifically, we first construct a high-quality multimodal CoT dataset without human annotations by leveraging an existing MLLM and DeepSeek-R1 through modality bridging and data filtering, obtaining a 200K multimodal CoT dataset, the Vision-R1-cold dataset. It serves as cold-start initialization data for Vision-R1. To mitigate the optimization challenges caused by overthinking after the cold start, we propose a Progressive Thinking Suppression Training (PTST) strategy and employ Group Relative Policy Optimization (GRPO) with a hard formatting result reward function to gradually refine the model's ability to learn correct and complex reasoning processes on a 10K multimodal math dataset. Comprehensive experiments show our model achieves an average improvement of ~6% across various multimodal math reasoning benchmarks. Vision-R1-7B achieves 73.5% accuracy on the widely used MathVista benchmark, which is only 0.4% lower than the leading reasoning model, OpenAI O1. The datasets and code will be released at: https://github.com/Osilly/Vision-R1
Submitted 11 March, 2025; v1 submitted 9 March, 2025; originally announced March 2025.
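The "hard formatting result reward" admits a compact sketch: an all-or-nothing signal that pays out only when both the output template and the final answer are correct. The think/answer template and the exact-match rule below are our assumptions for illustration, not the released reward code:

    import re

    def hard_format_result_reward(response: str, gold_answer: str) -> float:
        # Reward 1.0 only if the response follows the required template AND
        # the extracted answer matches the reference; otherwise 0.0.
        m = re.fullmatch(r"\s*<think>.*?</think>\s*<answer>(.*?)</answer>\s*",
                         response, flags=re.S)
        if m is None:
            return 0.0
        return 1.0 if m.group(1).strip() == gold_answer.strip() else 0.0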
arXiv:2503.06457 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Geometric Knowledge-Guided Localized Global Distribution Alignment for Federated Learning
Authors: Yanbiao Ma, Wei Dai, Wenke Huang, Jiayi Chen
Abstract: Data heterogeneity in federated learning, characterized by a significant misalignment between local and global distributions, leads to divergent local optimization directions and hinders global model training. Existing studies mainly focus on optimizing local updates or global aggregation, but these indirect approaches demonstrate instability when handling highly heterogeneous data distributions, especially in scenarios where label skew and domain skew coexist. To address this, we propose a geometry-guided data generation method that centers on simulating the global embedding distribution locally. We first introduce the concept of the geometric shape of an embedding distribution and then address the challenge of obtaining global geometric shapes under privacy constraints. Subsequently, we propose GGEUR, which leverages global geometric shapes to guide the generation of new samples, enabling a closer approximation to the ideal global distribution. In single-domain scenarios, we augment samples based on global geometric shapes to enhance model generalization; in multi-domain scenarios, we further employ class prototypes to simulate the global distribution across domains. Extensive experimental results demonstrate that our method significantly enhances the performance of existing approaches in handling highly heterogeneous data, including scenarios with label skew, domain skew, and their coexistence. Code published at: https://github.com/WeiDai-David/2025CVPR_GGEUR
Submitted 9 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025
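The geometry-guided generation step can be sketched with NumPy: treat the eigen-decomposition of the global class covariance as the "global geometric shape" and sample new embeddings along those directions. The sampling rule here is our illustrative assumption, not GGEUR's published algorithm:

    import numpy as np

    def geometry_guided_augment(local_embs: np.ndarray, global_cov: np.ndarray,
                                n_new: int, rng=None) -> np.ndarray:
        # Generate embeddings that follow the *global* geometric shape of a
        # class: sample offsets along the global covariance eigen-directions
        # and add them to randomly chosen local samples.
        rng = rng or np.random.default_rng(0)
        vals, vecs = np.linalg.eigh(global_cov)      # global geometric shape
        vals = np.clip(vals, 0.0, None)              # guard tiny negatives
        base = local_embs[rng.integers(len(local_embs), size=n_new)]
        coeffs = rng.normal(size=(n_new, len(vals))) * np.sqrt(vals)
        return base + coeffs @ vecs.T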
arXiv:2503.06252 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Can Atomic Step Decomposition Enhance the Self-structured Reasoning of Multimodal Large Models?
Authors: Kun Xiang, Zhili Liu, Zihao Jiang, Yunshuang Nie, Kaixin Cai, Yiyang Yin, Runhui Huang, Haoxiang Fan, Hanhui Li, Weiran Huang, Yihan Zeng, Yu-Jie Yuan, Jianhua Han, Lanqing Hong, Hang Xu, Xiaodan Liang
Abstract: In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of "slow thinking" into multimodal large language models (MLLMs). Our core idea is that different levels of reasoning abilities can be combined dynamically to tackle questions with different complexity.
Abstract: In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of "slow thinking" into multimodal large language models (MLLMs). Our core idea is that different levels of reasoning abilities can be combined dynamically to tackle questions with different complexity. To this end, we propose a paradigm of Self-structured Chain of Thought (SCoT), which is composed of minimal semantic atomic steps. Different from existing methods that rely on structured templates or free-form paradigms, our method not only generates cognitive CoT structures for various complex tasks but also mitigates the phenomenon of overthinking. To introduce structured reasoning capabilities into visual understanding models, we further design a novel AtomThink framework with four key modules, including (i) a data engine to generate high-quality multimodal reasoning paths; (ii) a supervised fine-tuning process with serialized inference data; (iii) a policy-guided multi-turn inference method; and (iv) an atomic capability metric to evaluate the single step utilization rate. We conduct extensive experiments to show that the proposed AtomThink significantly improves the performance of baseline MLLMs, achieving more than 10% average accuracy gains on MathVista and MathVerse. Compared to state-of-the-art structured CoT approaches, our method not only achieves higher accuracy but also improves data utilization by 5 times and boosts inference efficiency by 85.3%. Our code is now publicly available at https://github.com/Quinn777/AtomThink.
Submitted 8 March, 2025; originally announced March 2025.
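As a loose sketch of what multi-turn inference over minimal atomic steps might look like, the loop below asks a model for one step at a time until it emits a final answer. `mllm_generate` is a hypothetical stand-in for any MLLM call; the real AtomThink interface lives in the linked repository.

```python
# Hypothetical sketch of multi-turn atomic-step inference (not AtomThink's API).
def mllm_generate(prompt: str) -> str:
    raise NotImplementedError("stand-in for a real MLLM call")

def atomic_inference(question: str, image_desc: str, max_steps: int = 16) -> str:
    steps = []
    for _ in range(max_steps):
        prompt = (
            f"Question: {question}\nImage: {image_desc}\n"
            "Steps so far:\n" + "\n".join(steps) +
            "\nProduce ONE minimal atomic reasoning step, or 'ANSWER: ...' if done."
        )
        step = mllm_generate(prompt).strip()
        if step.startswith("ANSWER:"):
            return step.removeprefix("ANSWER:").strip()
        steps.append(step)  # grow the self-structured chain one atom at a time
    return "no answer within budget"
```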
arXiv:2503.04543 [pdf, other] cs.CL cs.AI

Keeping Yourself is Important in Downstream Tuning Multimodal Large Language Model

Authors: Wenke Huang, Jian Liang, Xianda Guo, Yiyang Fang, Guancheng Wan, Xuankun Rong, Chi Wen, Zekun Shi, Qingyun Li, Didi Zhu, Yanbiao Ma, Ke Liang, Bin Yang, He Li, Jiawei Shao, Mang Ye, Bo Du

Abstract: Multi-modal Large Language Models (MLLMs) integrate visual and linguistic reasoning to address complex tasks such as image captioning and visual question answering. While MLLMs demonstrate remarkable versatility, they show limited performance on specialized applications, and tuning MLLMs for downstream tasks encounters two key challenges: Task-Expert Specialization, where distribution shifts between pre-training and target datasets constrain target performance, and Open-World Stabilization, where catastrophic forgetting erases the model's general knowledge. In this work, we systematically review recent advancements in MLLM tuning methodologies, classifying them into three paradigms: (I) Selective Tuning, (II) Additive Tuning, and (III) Reparameterization Tuning. Furthermore, we benchmark these tuning strategies across popular MLLM architectures and diverse downstream tasks to establish standardized evaluation analysis and systematic tuning principles. Finally, we highlight several open challenges in this domain and propose future research directions. To facilitate ongoing progress in this rapidly evolving field, we provide a public repository that continuously tracks developments: https://github.com/WenkeHuang/Awesome-MLLM-Tuning.
Submitted 6 March, 2025; originally announced March 2025.
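Of the three paradigms the survey names, reparameterization tuning is the easiest to show in a few lines. Below is a minimal LoRA-style linear layer, a standard instance of that paradigm rather than code from the survey's repository: the frozen pre-trained weight is augmented with a trainable low-rank update, which also helps preserve general knowledge against forgetting.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Minimal LoRA-style reparameterization: y = base(x) + (alpha/r) * x A^T B^T."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False            # keep pre-trained knowledge frozen
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # starts as no-op
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(512, 512))
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # only A, B train
```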
arXiv:2503.03747 [pdf, other] cs.CR cs.LG

PacketCLIP: Multi-Modal Embedding of Network Traffic and Language for Cybersecurity Reasoning

Authors: Ryozo Masukawa, Sanggeon Yun, Sungheon Jeong, Wenjun Huang, Yang Ni, Ian Bryant, Nathaniel D. Bastian, Mohsen Imani
Abstract: Traffic classification is vital for cybersecurity, yet encrypted traffic poses significant challenges. We present PacketCLIP, a multi-modal framework combining packet data with natural language semantics through contrastive pretraining and hierarchical Graph Neural Network (GNN) reasoning. PacketCLIP integrates semantic reasoning with efficient classification, enabling robust detection of anomalies in encrypted network flows. By aligning textual descriptions with packet behaviors, it offers enhanced interpretability, scalability, and practical applicability across diverse security scenarios. PacketCLIP achieves a 95% mean AUC, outperforms baselines by 11.6%, and reduces model size by 92%, making it ideal for real-time anomaly detection. By bridging advanced machine learning techniques and practical cybersecurity needs, PacketCLIP provides a foundation for scalable, efficient, and interpretable solutions to tackle encrypted traffic classification and network intrusion detection challenges in resource-constrained environments.
Submitted 5 March, 2025; originally announced March 2025.
Comments: 7 pages, 7 figures
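The CLIP-style contrastive pretraining the abstract mentions can be summarized generically: embed packets and their textual descriptions, then pull matching pairs together with a symmetric InfoNCE loss. The encoders are placeholders here, not PacketCLIP's architecture.

```python
import torch
import torch.nn.functional as F

def clip_style_loss(pkt_emb, txt_emb, temperature: float = 0.07):
    """Symmetric InfoNCE over a batch of matching (packet, text) pairs."""
    pkt = F.normalize(pkt_emb, dim=-1)
    txt = F.normalize(txt_emb, dim=-1)
    logits = pkt @ txt.T / temperature        # (B, B) similarity matrix
    targets = torch.arange(len(pkt))          # i-th packet matches i-th text
    return (F.cross_entropy(logits, targets) +
            F.cross_entropy(logits.T, targets)) / 2

# Toy batch: stand-in encoders would produce these embeddings in practice.
loss = clip_style_loss(torch.randn(32, 256), torch.randn(32, 256))
print(loss.item())
```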
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01901">arXiv:2503.01901</a> <span> [<a href="https://arxiv.org/pdf/2503.01901">pdf</a>, <a href="https://arxiv.org/format/2503.01901">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Identifying Sensitive Weights via Post-quantization Integral </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuezhou Hu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weiyu Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zichen Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jintao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01901v1-abstract-short" style="display: inline;"> Serving Large Language Models (LLMs) is costly. However, post-training weight quantization can address this problem by both compressing their sizes for limited memory and saving bandwidth for acceleration. As not all weight dimensions are equally important, those methods typically rely on a sensitivity metric, which indicates the element-wise influence of weights on loss function and is used to pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01901v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01901v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01901v1-abstract-full" style="display: none;"> Serving Large Language Models (LLMs) is costly. However, post-training weight quantization can address this problem by both compressing their sizes for limited memory and saving bandwidth for acceleration. As not all weight dimensions are equally important, those methods typically rely on a sensitivity metric, which indicates the element-wise influence of weights on loss function and is used to preprocess original weights for better quantization. In this work, we conduct an empirical study on the accuracy of the sensitivity metric, and find that existing gradient and Hessian based metrics are very inaccurate: they underestimate quantization's impact on the loss function by orders of magnitude, mainly due to the small convergence radius of local 2nd order approximation, \ie, gradient and Hessian term in Taylor's formula. To tackle this problem, we propose Post-quantization Integral (PQI), an accurate metric to estimate posterior sensitivity in a fine-grained manner. 
arXiv:2503.01178 [pdf, other] cs.LG cs.RO

Differentiable Information Enhanced Model-Based Reinforcement Learning

Authors: Xiaoyuan Zhang, Xinyan Cai, Bo Liu, Weidong Huang, Song-Chun Zhu, Siyuan Qi, Yaodong Yang
Abstract: Differentiable environments have heralded new possibilities for learning control policies by offering rich differentiable information that facilitates gradient-based methods. In comparison to prevailing model-free reinforcement learning approaches, model-based reinforcement learning (MBRL) methods exhibit the potential to effectively harness the power of differentiable information for recovering the underlying physical dynamics. However, this presents two primary challenges: effectively utilizing differentiable information to 1) construct models with more accurate dynamic prediction and 2) enhance the stability of policy training. In this paper, we propose a Differentiable Information Enhanced MBRL method, MB-MIX, to address both challenges. Firstly, we adopt a Sobolev model training approach that penalizes incorrect model gradient outputs, enhancing prediction accuracy and yielding more precise models that faithfully capture system dynamics. Secondly, we introduce mixing lengths of truncated learning windows to reduce the variance in policy gradient estimation, resulting in improved stability during policy learning. To validate the effectiveness of our approach in differentiable environments, we provide theoretical analysis and empirical results. Notably, our approach outperforms previous model-based and model-free methods in multiple challenging tasks involving controllable rigid robots, such as humanoid robots' motion control and deformable object manipulation.
Submitted 2 March, 2025; originally announced March 2025.
Comments: Accepted by AAAI 2025
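The Sobolev model-training idea the abstract describes, penalizing wrong model gradients, can be sketched for a scalar-output dynamics model: match not just the model's outputs but also its input gradients against those exposed by a differentiable simulator. This is a generic illustration of Sobolev training under that assumption, not MB-MIX itself.

```python
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(3, 64), nn.Tanh(), nn.Linear(64, 1))

def sobolev_loss(x, y_true, dy_dx_true, lam: float = 0.5):
    """Output loss + penalty on the model's input-gradient error."""
    x = x.requires_grad_(True)
    y = net(x)
    dy_dx, = torch.autograd.grad(y.sum(), x, create_graph=True)
    return nn.functional.mse_loss(y, y_true) + \
           lam * nn.functional.mse_loss(dy_dx, dy_dx_true)

# Toy target: y = sin(x0), with the true gradient playing the role of
# differentiable information from a simulator.
x = torch.randn(128, 3)
y_true = torch.sin(x[:, :1])
dy_dx_true = torch.cat([torch.cos(x[:, :1]), torch.zeros(128, 2)], dim=1)
loss = sobolev_loss(x, y_true, dy_dx_true)
loss.backward()  # trains net to match values *and* gradients
```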
arXiv:2502.20763 [pdf, other] cs.LG

Information-Theoretic Perspectives on Optimizers

Authors: Zhiquan Tan, Weiran Huang

Abstract: The interplay of optimizers and architectures in neural networks is complicated, and it is hard to understand why some optimizers work better on some specific architectures. In this paper, we find that the traditionally used sharpness metric does not fully explain this intricate interplay, and we introduce an information-theoretic metric called the entropy gap to aid the analysis. We find that both sharpness and the entropy gap affect performance, including optimization dynamics and generalization. We further use information-theoretic tools to understand a recently proposed optimizer called Lion and find ways to improve it.
Submitted 28 February, 2025; originally announced February 2025.

arXiv:2502.19668 [pdf, other] eess.SP cs.AI cs.CL cs.LG

SuPreME: A Supervised Pre-training Framework for Multimodal ECG Representation Learning

Authors: Mingsheng Cai, Jiuming Jiang, Wenhao Huang, Che Liu, Rossella Arcucci
Abstract: Cardiovascular diseases are a leading cause of death and disability worldwide. Electrocardiogram (ECG) recordings are critical for diagnosing and monitoring cardiac health, but obtaining large-scale annotated ECG datasets is labor-intensive and time-consuming. Recent ECG Self-Supervised Learning (eSSL) methods mitigate this by learning features without extensive labels but fail to capture fine-grained clinical semantics and require extensive task-specific fine-tuning. To address these challenges, we propose SuPreME, a Supervised Pre-training framework for Multimodal ECG representation learning. SuPreME applies Large Language Models (LLMs) to extract structured clinical entities from free-text ECG reports, filter out noise and irrelevant content, enhance clinical representation learning, and build a high-quality, fine-grained labeled dataset. By using text-based cardiac queries instead of traditional categorical labels, SuPreME enables zero-shot classification of unseen diseases without additional fine-tuning. We evaluate SuPreME on six downstream datasets covering 127 cardiac conditions, achieving superior zero-shot AUC performance over state-of-the-art eSSL and multimodal methods by over 1.96%. Results demonstrate the effectiveness of SuPreME in leveraging structured, clinically relevant knowledge for high-quality ECG representations. All code and data will be released upon acceptance.
Submitted 26 February, 2025; originally announced February 2025.
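Zero-shot classification with text-based queries, as the abstract describes, generally reduces to scoring an ECG embedding against embeddings of each candidate condition's text. The sketch below assumes such embeddings already exist; the encoders are placeholders rather than SuPreME's actual interface.

```python
import numpy as np

def zero_shot_scores(ecg_emb: np.ndarray, query_embs: np.ndarray) -> np.ndarray:
    """Cosine similarity of one ECG embedding against each text-query embedding."""
    e = ecg_emb / np.linalg.norm(ecg_emb)
    q = query_embs / np.linalg.norm(query_embs, axis=1, keepdims=True)
    return q @ e

queries = ["atrial fibrillation", "left bundle branch block", "normal sinus rhythm"]
rng = np.random.default_rng(0)
ecg_emb = rng.normal(size=128)           # placeholder for an ECG encoder output
query_embs = rng.normal(size=(3, 128))   # placeholder for text encoder outputs
scores = zero_shot_scores(ecg_emb, query_embs)
print(queries[int(np.argmax(scores))])   # unseen disease? just add a new query
```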
arXiv:2502.19568 [pdf] cs.LG cs.CV eess.IV

PhenoProfiler: Advancing Phenotypic Learning for Image-based Drug Discovery

Authors: Bo Li, Bob Zhang, Chengyang Zhang, Minghao Zhou, Weiliang Huang, Shihang Wang, Qing Wang, Mengran Li, Yong Zhang, Qianqian Song

Abstract: In the field of image-based drug discovery, capturing the phenotypic response of cells to various drug treatments and perturbations is a crucial step. However, existing methods require computationally extensive and complex multi-step procedures, which can introduce inefficiencies, limit generalizability, and increase potential errors. To address these challenges, we present PhenoProfiler, an innovative model designed to efficiently and effectively extract morphological representations, enabling the elucidation of phenotypic changes induced by treatments. PhenoProfiler is designed as an end-to-end tool that processes whole-slide multi-channel images directly into low-dimensional quantitative representations, eliminating the extensive computational steps required by existing methods. It also includes a multi-objective learning module to enhance robustness, accuracy, and generalization in morphological representation learning.
PhenoProfiler is rigorously evaluated on large-scale publicly available datasets, including over 230,000 whole-slide multi-channel images in end-to-end scenarios and more than 8.42 million single-cell images in non-end-to-end settings. Across these benchmarks, PhenoProfiler consistently outperforms state-of-the-art methods by up to 20%, demonstrating substantial improvements in both accuracy and robustness. Furthermore, PhenoProfiler uses a tailored phenotype correction strategy to emphasize relative phenotypic changes under treatments, facilitating the detection of biologically meaningful signals. UMAP visualizations of treatment profiles demonstrate PhenoProfiler's ability to effectively cluster treatments with similar biological annotations, thereby enhancing interpretability. These findings establish PhenoProfiler as a scalable, generalizable, and robust tool for phenotypic learning.
Submitted 26 February, 2025; originally announced February 2025.

arXiv:2502.18875 [pdf, other] q-bio.BM cs.AI cs.LG

SE(3)-Equivariant Ternary Complex Prediction Towards Target Protein Degradation

Authors: Fanglei Xue, Meihan Zhang, Shuqi Li, Xinyu Gao, James A. Wohlschlegel, Wenbing Huang, Yi Yang, Weixian Deng

Abstract: Targeted protein degradation (TPD) induced by small molecules has emerged as a rapidly evolving modality in drug discovery, targeting proteins traditionally considered "undruggable". Proteolysis-targeting chimeras (PROTACs) and molecular glue degraders (MGDs) are the primary small molecules that induce TPD.
Both types of molecules form a ternary complex linking an E3 ligase with a target protein, a crucial step for drug discovery. While significant advances have been made in binary structure prediction for proteins and small molecules, ternary structure prediction remains challenging due to obscure interaction mechanisms and insufficient training data. Traditional methods relying on manually assigned rules perform poorly and are computationally demanding due to extensive random sampling. In this work, we introduce DeepTernary, a novel deep learning-based approach that directly predicts ternary structures in an end-to-end manner using an encoder-decoder architecture. DeepTernary leverages an SE(3)-equivariant graph neural network (GNN) with both intra-graph and ternary inter-graph attention mechanisms to capture intricate ternary interactions from our collected high-quality training dataset, TernaryDB. The proposed query-based Pocket Points Decoder extracts the 3D structure of the final binding ternary complex from learned ternary embeddings, demonstrating state-of-the-art accuracy and speed in existing PROTAC benchmarks without prior knowledge from known PROTACs. It also achieves notable accuracy on the more challenging MGD benchmark under the blind docking protocol. Remarkably, our experiments reveal that the buried surface area calculated from predicted structures correlates with experimentally obtained degradation potency-related metrics. Consequently, DeepTernary shows potential in effectively assisting and accelerating the development of TPDs for previously undruggable targets.
Submitted 26 February, 2025; originally announced February 2025.
arXiv:2502.18489 [pdf, other] cs.SE cs.AI

LLM4EFFI: Leveraging Large Language Models to Enhance Code Efficiency and Correctness

Authors: Tong Ye, Weigang Huang, Xuhong Zhang, Tengfei Ma, Peiyu Liu, Jianwei Yin, Wenhai Wang

Abstract: Large Language Models (LLMs), particularly Code LLMs, have demonstrated impressive performance in code generation. Current research primarily focuses on the correctness of generated code, while efficiency remains less explored. Recent works have focused on modifying the initial version of the code to improve its efficiency. However, such refinements are limited by the algorithmic design and overall logic of the initial code, resulting in only incremental improvements. In contrast, when human developers write high-quality code, they typically begin by designing several potential solutions at the logical level, evaluating various algorithms and their complexities, and then proceeding to implement and optimize the solution. In this study, we introduce LLM4EFFI: Large Language Model for Code Efficiency, a novel framework that enables LLMs to generate code that balances both efficiency and correctness. Specifically, LLM4EFFI divides the efficiency optimization process into two domains: algorithmic exploration in the logic domain and implementation optimization in the code domain. The correctness of the code is then guaranteed through a synthetic test case refinement process. This approach, which prioritizes efficiency before ensuring correctness, offers a new paradigm for efficient code generation. Experiments demonstrate that LLM4EFFI consistently improves both efficiency and correctness, achieving new state-of-the-art performance in code efficiency benchmarks across various LLM backbones.
Submitted 17 February, 2025; originally announced February 2025.
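The two-domain workflow the abstract outlines can be paraphrased as a pipeline: brainstorm algorithms first, implement each, then keep the fastest candidate that passes synthesized tests. The sketch below is one interpretation with a hypothetical `llm` helper, not the authors' framework.

```python
# Hypothetical sketch of a logic-domain-first efficiency pipeline (not LLM4EFFI itself).
import time

def llm(prompt: str) -> str:
    raise NotImplementedError("stand-in for any code LLM call")

def generate_efficient_code(task: str, n_algos: int = 3):
    # 1) Logic domain: explore candidate algorithms and their complexities.
    algos = [llm(f"Propose algorithm #{i} (with complexity) for: {task}")
             for i in range(n_algos)]
    # 2) Code domain: implement each candidate.
    impls = [llm(f"Implement in Python:\n{a}") for a in algos]
    # 3) Correctness via synthetic tests, then pick the fastest survivor.
    tests = llm(f"Write assert-based tests for: {task}")
    best, best_time = None, float("inf")
    for code in impls:
        namespace: dict = {}
        try:
            start = time.perf_counter()
            exec(code + "\n" + tests, namespace)   # run implementation + tests
            elapsed = time.perf_counter() - start
        except Exception:
            continue                               # failed tests: discard candidate
        if elapsed < best_time:
            best, best_time = code, elapsed
    return best
```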
arXiv:2502.18273 [pdf, other] cs.CL

Beyond In-Distribution Success: Scaling Curves of CoT Granularity for Language Model Generalization

Authors: Ru Wang, Wei Huang, Selena Song, Haoyu Zhang, Yusuke Iwasawa, Yutaka Matsuo, Jiaxian Guo

Abstract: Generalization to novel compound tasks under distribution shift is important for deploying transformer-based language models (LMs). This work investigates Chain-of-Thought (CoT) reasoning as a means to enhance OOD generalization.
Through controlled experiments across several compound tasks, we reveal three key insights: (1) While QA-trained models achieve near-perfect in-distribution accuracy, their OOD performance degrades catastrophically, even with 10000k+ training examples; (2) the granularity of CoT data strongly correlates with generalization performance; finer-grained CoT data leads to better generalization; (3) CoT exhibits remarkable sample efficiency, matching QA performance with much less (even 80%) data. Theoretically, we demonstrate that compound tasks inherently permit shortcuts in Q-A data that misalign with true reasoning principles, while CoT forces internalization of valid dependency structures, and thus can achieve better generalization. Further, we show that transformer positional embeddings can amplify generalization by emphasizing subtask condition recurrence in long CoT sequences. Our combined theoretical and empirical analysis provides compelling evidence for CoT reasoning as a crucial training paradigm for enabling LM generalization under real-world distributional shifts for compound tasks.
Submitted 25 February, 2025; originally announced February 2025.

arXiv:2502.17967 [pdf, other] cs.LG cs.AI cs.CL cs.MA q-fin.ST

LLM Knows Geometry Better than Algebra: Numerical Understanding of LLM-Based Agents in A Trading Arena

Authors: Tianmi Ma, Jiawei Du, Wenxin Huang, Wenjie Wang, Liang Xie, Xian Zhong, Joey Tianyi Zhou

Abstract: Recent advancements in large language models (LLMs) have significantly improved performance in natural language processing tasks.
However, their ability to generalize to dynamic, unseen tasks, particularly in numerical reasoning, remains a challenge. Existing benchmarks mainly evaluate LLMs on problems with predefined optimal solutions, which may not align with real-world scenarios where clear answers are absent. To bridge this gap, we design the Agent Trading Arena, a virtual numerical game simulating complex economic systems through zero-sum games, where agents invest in stock portfolios. Our experiments reveal that LLMs, including GPT-4o, struggle with algebraic reasoning when dealing with plain-text stock data, often focusing on local details rather than global trends. In contrast, LLMs perform significantly better with geometric reasoning when presented with visual data, such as scatter plots or K-line charts, suggesting that visual representations enhance numerical reasoning. This capability is further improved by incorporating a reflection module, which aids in the analysis and interpretation of complex data. We validate our findings on the NASDAQ Stock dataset, where LLMs demonstrate stronger reasoning with visual data compared to text. Our code and data are publicly available at https://github.com/wekjsdvnm/Agent-Trading-Arena.git.
Submitted 25 February, 2025; originally announced February 2025.
arXiv:2502.17089 [pdf, other] cs.HC cs.CV cs.ET

Imprinto: Enhancing Infrared Inkjet Watermarking for Human and Machine Perception

Authors: Martin Feick, Xuxin Tang, Raul Garcia-Martin, Alexandru Luchianov, Roderick Wei Xiao Huang, Chang Xiao, Alexa Siu, Mustafa Doga Dogan

Abstract: Hybrid paper interfaces leverage augmented reality to combine the desired tangibility of paper documents with the affordances of interactive digital media. Typically, virtual content can be embedded through direct links (e.g., QR codes); however, this impacts the aesthetics of the paper print and limits the available visual content space. To address this problem, we present Imprinto, an infrared inkjet watermarking technique that allows for invisible content embeddings only by using off-the-shelf IR inks and a camera. Imprinto was established through a psychophysical experiment, studying how much IR ink can be used while remaining invisible to users regardless of background color. We demonstrate that we can detect invisible IR content through our machine learning pipeline, and we developed an authoring tool that optimizes the amount of IR ink on the color regions of an input document for machine and human detectability. Finally, we demonstrate several applications, including augmenting paper documents and objects.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17089v1-abstract-full').style.display = 'none'; document.getElementById('2502.17089v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 13 figures. To appear in the Proceedings of the 2025 ACM CHI Conference on Human Factors in Computing Systems. https://imprinto.github.io</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Huang%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 