Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 4,678 results for author: <span class="mathjax">Yang, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yang%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yang, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17258">arXiv:2502.17258</a> <span> [<a href="https://arxiv.org/pdf/2502.17258">pdf</a>, <a href="https://arxiv.org/format/2502.17258">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VideoGrain: Modulating Space-Time Attention for Multi-grained Video Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiangpeng Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Linchao Zhu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Hehe Fan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17258v1-abstract-short" style="display: inline;"> Recent advancements in diffusion models have significantly improved video generation and editing capabilities. However, multi-grained video editing, which encompasses class-level, instance-level, and part-level modifications, remains a formidable challenge. 
The major difficulties in multi-grained editing include semantic misalignment of text-to-region control and feature coupling within the diffus… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17258v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17258v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17258v1-abstract-full" style="display: none;"> Recent advancements in diffusion models have significantly improved video generation and editing capabilities. However, multi-grained video editing, which encompasses class-level, instance-level, and part-level modifications, remains a formidable challenge. The major difficulties in multi-grained editing include semantic misalignment of text-to-region control and feature coupling within the diffusion model. To address these difficulties, we present VideoGrain, a zero-shot approach that modulates space-time (cross- and self-) attention mechanisms to achieve fine-grained control over video content. We enhance text-to-region control by amplifying each local prompt's attention to its corresponding spatial-disentangled region while minimizing interactions with irrelevant areas in cross-attention. Additionally, we improve feature separation by increasing intra-region awareness and reducing inter-region interference in self-attention. Extensive experiments demonstrate our method achieves state-of-the-art performance in real-world scenarios. Our code, data, and demos are available at https://knightyxp.github.io/VideoGrain_project_page/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17258v1-abstract-full').style.display = 'none'; document.getElementById('2502.17258v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
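The attention modulation above is only sketched in the abstract; for intuition, here is a minimal, hypothetical Python rendering of region-conditioned cross-attention reweighting: amplify each prompt token's attention logits inside its associated region and damp them elsewhere. The shapes, the `gain` offset, and all names are illustrative assumptions, not VideoGrain's actual implementation.

```python
import torch

def modulated_cross_attention(q, k, v, token_region, region_masks, gain=2.0):
    """Toy region-aware cross-attention: amplify each text token's attention
    to its associated spatial region and suppress it elsewhere.

    q: (N, d) flattened spatial queries; k, v: (T, d) text keys/values.
    token_region: length-T list mapping each token to a region id (or None).
    region_masks: dict region_id -> (N,) boolean mask over spatial positions.
    All shapes and parameters here are assumptions for illustration.
    """
    d = q.shape[-1]
    scores = q @ k.T / d**0.5                      # (N, T) raw attention logits
    for t, r in enumerate(token_region):
        if r is None:
            continue                               # token not tied to a region
        inside = region_masks[r]                   # spatial positions of region r
        scores[inside, t] += gain                  # amplify token-to-its-region attention
        scores[~inside, t] -= gain                 # damp token-to-irrelevant-area attention
    attn = scores.softmax(dim=-1)
    return attn @ v                                # (N, d) attended features
```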
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025, code and demos are available at https://knightyxp.github.io/VideoGrain_project_page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17213">arXiv:2502.17213</a> <span> [<a href="https://arxiv.org/pdf/2502.17213">pdf</a>, <a href="https://arxiv.org/format/2502.17213">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning-Powered Electrical Brain Signals Analysis: Advancing Neurological Diagnostics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahe Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+F">Fanqi Shen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junru Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuxin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Daoze Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zhizhang Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Fang Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17213v1-abstract-short" style="display: inline;"> Neurological disorders represent significant global health challenges, driving the advancement of brain signal analysis methods. Scalp electroencephalography (EEG) and intracranial electroencephalography (iEEG) are widely used to diagnose and monitor neurological conditions. However, dataset heterogeneity and task variations pose challenges in developing robust deep learning solutions. This review… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17213v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17213v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17213v1-abstract-full" style="display: none;"> Neurological disorders represent significant global health challenges, driving the advancement of brain signal analysis methods. Scalp electroencephalography (EEG) and intracranial electroencephalography (iEEG) are widely used to diagnose and monitor neurological conditions. However, dataset heterogeneity and task variations pose challenges in developing robust deep learning solutions. This review systematically examines recent advances in deep learning approaches for EEG/iEEG-based neurological diagnostics, focusing on applications across 7 neurological conditions using 46 datasets. 
We explore trends in data utilization, model design, and task-specific adaptations, highlighting the importance of pre-trained multi-task models for scalable, generalizable solutions. To advance research, we propose a standardized benchmark for evaluating models across diverse datasets to enhance reproducibility. This survey emphasizes how recent innovations can transform neurological diagnostics and enable the development of intelligent, adaptable healthcare solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17213v1-abstract-full').style.display = 'none'; document.getElementById('2502.17213v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17184">arXiv:2502.17184</a> <span> [<a href="https://arxiv.org/pdf/2502.17184">pdf</a>, <a href="https://arxiv.org/format/2502.17184">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Measuring Data Diversity for Instruction Tuning: A Systematic Analysis and A Reliable Metric </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuming Yang</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+Y">Yang Nan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Junjie Ye</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+S">Shihan Dou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuo Li</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+H">Huijie Lv</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17184v1-abstract-short" style="display: inline;"> Data diversity is crucial for the instruction tuning of large language models. Existing studies have explored various diversity-aware data selection methods to construct high-quality datasets and enhance model performance. However, the fundamental problem of precisely defining and measuring data diversity remains underexplored, limiting clear guidance for data engineering. To address this, we syst… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17184v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17184v1-abstract-full" style="display: none;"> Data diversity is crucial for the instruction tuning of large language models. Existing studies have explored various diversity-aware data selection methods to construct high-quality datasets and enhance model performance. 
3. arXiv:2502.17184 [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Measuring Data Diversity for Instruction Tuning: A Systematic Analysis and A Reliable Metric
Authors: Yuming Yang, Yang Nan, Junjie Ye, Shihan Dou, Xiao Wang, Shuo Li, Huijie Lv, Tao Gui, Qi Zhang, Xuanjing Huang
Abstract: Data diversity is crucial for the instruction tuning of large language models. Existing studies have explored various diversity-aware data selection methods to construct high-quality datasets and enhance model performance. However, the fundamental problem of precisely defining and measuring data diversity remains underexplored, limiting clear guidance for data engineering. To address this, we systematically analyze 11 existing diversity measurement methods by assessing their correlation with model performance through extensive fine-tuning experiments. Our results indicate that a reliable diversity measure should properly account for both inter-sample differences and the information distribution in the sample space. Building on this, we propose NovelSum, a new diversity metric based on sample-level "novelty." Experiments on both simulated and real-world data show that NovelSum accurately captures diversity variations and achieves a 0.97 correlation with instruction-tuned model performance, highlighting its value in guiding data engineering practices. With NovelSum as an optimization objective, we further develop a greedy, diversity-oriented data selection strategy that outperforms existing approaches, validating both the effectiveness and practical significance of our metric.
Submitted 24 February, 2025; originally announced February 2025.
Comments: 15 pages. The related codes and resources will be released later. Project page: https://github.com/UmeanNever/NovelSum
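The abstract does not define NovelSum itself, but the greedy, diversity-oriented selection strategy it mentions can be illustrated generically. In the sketch below, a sample's "novelty" is approximated, purely as an assumption, by its distance to the nearest already-selected sample (farthest-point selection); the actual metric is defined in the paper.

```python
import numpy as np

def greedy_diverse_select(embeddings: np.ndarray, k: int) -> list[int]:
    """Greedy diversity-oriented selection: repeatedly pick the sample whose
    distance to the already-selected set is largest (a farthest-point proxy
    for 'novelty'; illustrative only)."""
    selected = [0]                                  # seed with an arbitrary sample
    # distance from every sample to its nearest selected sample
    d = np.linalg.norm(embeddings - embeddings[0], axis=1)
    for _ in range(k - 1):
        nxt = int(d.argmax())                       # most novel remaining sample
        selected.append(nxt)
        d = np.minimum(d, np.linalg.norm(embeddings - embeddings[nxt], axis=1))
    return selected

print(greedy_diverse_select(np.random.rand(100, 8), k=5))
```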
4. arXiv:2502.16896 [pdf, other]
Subjects: cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
Title: Zero-shot Load Forecasting for Integrated Energy Systems: A Large Language Model-based Framework with Multi-task Learning
Authors: Jiaheng Li, Donghe Li, Ye Yang, Huan Xi, Yu Xiao, Li Sun, Dou An, Qingyu Yang
Abstract: The growing penetration of renewable energy sources in power systems has increased the complexity and uncertainty of load forecasting, especially for integrated energy systems with multiple energy carriers. Traditional forecasting methods heavily rely on historical data and exhibit limited transferability across different scenarios, posing significant challenges for emerging applications in smart grids and energy internet. This paper proposes the TSLLM-Load Forecasting Mechanism, a novel zero-shot load forecasting framework based on large language models (LLMs) to address these challenges. The framework consists of three key components: a data preprocessing module that handles multi-source energy load data, a time series prompt generation module that bridges the semantic gap between energy data and LLMs through multi-task learning and similarity alignment, and a prediction module that leverages pre-trained LLMs for accurate forecasting. The framework's effectiveness was validated on a real-world dataset comprising load profiles from 20 Australian solar-powered households, demonstrating superior performance in both conventional and zero-shot scenarios. In conventional testing, our method achieved a Mean Squared Error (MSE) of 0.4163 and a Mean Absolute Error (MAE) of 0.3760, outperforming existing approaches by at least 8%. In zero-shot prediction experiments across 19 households, the framework maintained consistent accuracy with a total MSE of 11.2712 and MAE of 7.6709, showing at least 12% improvement over current methods. The results validate the framework's potential for accurate and transferable load forecasting in integrated energy systems, particularly beneficial for renewable energy integration and smart grid applications.
Submitted 24 February, 2025; originally announced February 2025.
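For reference, the MSE and MAE figures quoted in the abstract follow the standard definitions; a minimal recap in Python:

```python
import numpy as np

def mse(y_true, y_pred) -> float:
    """Mean Squared Error: average of squared residuals."""
    return float(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

def mae(y_true, y_pred) -> float:
    """Mean Absolute Error: average of absolute residuals."""
    return float(np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred))))
```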
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16611">arXiv:2502.16611</a> <span> [<a href="https://arxiv.org/pdf/2502.16611">pdf</a>, <a href="https://arxiv.org/format/2502.16611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Target Speaker Extraction through Comparing Noisy Positive and Negative Audio Enrollments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shitong Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yiyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Trigoni%2C+N">Niki Trigoni</a>, <a href="/search/cs?searchtype=author&query=Markham%2C+A">Andrew Markham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16611v1-abstract-short" style="display: inline;"> Target speaker extraction focuses on isolating a specific speaker's voice from an audio mixture containing multiple speakers. To provide information about the target speaker's identity, prior works have utilized clean audio examples as conditioning inputs. However, such clean audio examples are not always readily available (e.g. It is impractical to obtain a clean audio example of a stranger's voi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16611v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16611v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16611v1-abstract-full" style="display: none;"> Target speaker extraction focuses on isolating a specific speaker's voice from an audio mixture containing multiple speakers. To provide information about the target speaker's identity, prior works have utilized clean audio examples as conditioning inputs. However, such clean audio examples are not always readily available (e.g. It is impractical to obtain a clean audio example of a stranger's voice at a cocktail party without stepping away from the noisy environment). Limited prior research has explored extracting the target speaker's characteristics from noisy audio examples, which may include overlapping speech from disturbing speakers. In this work, we focus on target speaker extraction when multiple speakers are present during the enrollment stage, through leveraging differences between audio segments where the target speakers are speaking (Positive Enrollments) and segments where they are not (Negative Enrollments). Experiments show the effectiveness of our model architecture and the dedicated pretraining method for the proposed task. Our method achieves state-of-the-art performance in the proposed application settings and demonstrates strong generalizability across challenging and realistic scenarios. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16611v1-abstract-full').style.display = 'none'; document.getElementById('2502.16611v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 5 figures, appendix included</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16602">arXiv:2502.16602</a> <span> [<a href="https://arxiv.org/pdf/2502.16602">pdf</a>, <a href="https://arxiv.org/format/2502.16602">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VidLBEval: Benchmarking and Mitigating Language Bias in Video-Involved LVLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yiming Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yangyang Guo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hui Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16602v1-abstract-short" style="display: inline;"> Recently, Large Vision-Language Models (LVLMs) have made significant strides across diverse multimodal tasks and benchmarks. This paper reveals a largely under-explored problem from existing video-involved LVLMs - language bias, where models tend to prioritize language over video and thus result in incorrect responses. To address this research gap, we first collect a Video Language Bias Evaluation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16602v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16602v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16602v1-abstract-full" style="display: none;"> Recently, Large Vision-Language Models (LVLMs) have made significant strides across diverse multimodal tasks and benchmarks. This paper reveals a largely under-explored problem from existing video-involved LVLMs - language bias, where models tend to prioritize language over video and thus result in incorrect responses. To address this research gap, we first collect a Video Language Bias Evaluation Benchmark, which is specifically designed to assess the language bias in video-involved LVLMs through two key tasks: ambiguous video contrast and interrogative question probing. Accordingly, we design accompanied evaluation metrics that aim to penalize LVLMs being biased by language. In addition, we also propose Multi-branch Contrastive Decoding (MCD), introducing two expert branches to simultaneously counteract language bias potentially generated by the amateur text-only branch. 
7. arXiv:2502.16528 [pdf, other]
Subjects: cs.RO (Robotics)
Title: OpenVox: Real-time Instance-level Open-vocabulary Probabilistic Voxel Representation
Authors: Yinan Deng, Bicheng Yao, Yihang Tang, Yi Yang, Yufeng Yue
Abstract: In recent years, vision-language models (VLMs) have advanced open-vocabulary mapping, enabling mobile robots to simultaneously achieve environmental reconstruction and high-level semantic understanding. While integrated object cognition helps mitigate semantic ambiguity in point-wise feature maps, efficiently obtaining rich semantic understanding and robust incremental reconstruction at the instance level remains challenging. To address these challenges, we introduce OpenVox, a real-time incremental open-vocabulary probabilistic instance voxel representation. In the front-end, we design an efficient instance segmentation and comprehension pipeline that enhances language reasoning through encoding captions. In the back-end, we implement probabilistic instance voxels and formulate the cross-frame incremental fusion process into two subtasks: instance association and live map evolution, ensuring robustness to sensor and segmentation noise. Extensive evaluations across multiple datasets demonstrate that OpenVox achieves state-of-the-art performance in zero-shot instance segmentation, semantic segmentation, and open-vocabulary retrieval. Furthermore, real-world robotics experiments validate OpenVox's capability for stable, real-time operation.
Submitted 23 February, 2025; originally announced February 2025.
Comments: Project website: https://open-vox.github.io
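The abstract does not specify what a "probabilistic instance voxel" stores; as one plausible toy reading, each voxel could accumulate soft evidence for instance labels across frames and expose a normalized posterior. Everything below is an illustrative assumption, not OpenVox's actual back-end.

```python
from collections import defaultdict

class ProbabilisticInstanceVoxel:
    """Toy voxel that accumulates noisy per-frame instance observations and
    exposes a normalized label posterior (illustrative only)."""
    def __init__(self):
        self.counts = defaultdict(float)

    def update(self, instance_id: int, confidence: float) -> None:
        self.counts[instance_id] += confidence      # soft evidence from one frame

    def posterior(self) -> dict[int, float]:
        total = sum(self.counts.values())
        return {k: v / total for k, v in self.counts.items()} if total else {}

v = ProbabilisticInstanceVoxel()
v.update(instance_id=3, confidence=0.9)   # two frames agree, one dissents
v.update(instance_id=3, confidence=0.8)
v.update(instance_id=7, confidence=0.3)
print(v.posterior())                      # {3: 0.85, 7: 0.15}
```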
8. arXiv:2502.16502 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Subpixel Edge Localization Based on Converted Intensity Summation under Stable Edge Region
Authors: Yingyuan Yang, Guoyuan Liang, Xianwen Wang, Kaiming Wang, Can Wang, Xiaojun Wu
Abstract: To satisfy the rigorous requirements of precise edge detection in critical high-accuracy measurements, this article proposes a series of efficient approaches for localizing subpixel edges. In contrast to fitting-based methods, which consider pixel intensity as a sample value derived from a specific model, we take an innovative perspective by assuming that the intensity at the pixel level can be interpreted as a local integral mapping in the intensity model for subpixel localization. Consequently, we propose a straightforward subpixel edge localization method called Converted Intensity Summation (CIS). To address the limited robustness associated with focusing solely on the localization of individual edge points, a Stable Edge Region (SER) based algorithm is presented to alleviate local interference near edges. Given the observation that the consistency of edge statistics exists in the local region, the algorithm seeks correlated stable regions in the vicinity of edges to facilitate the acquisition of robust parameters and achieve higher-precision positioning. In addition, an edge complement method based on extension-adjustment is also introduced to rectify irregular edges through the efficient migration of SERs. A large number of experiments are conducted on both synthetic and real image datasets, covering common edge patterns as well as various real scenarios such as industrial PCB images, remote sensing, and medical images. It is verified that CIS achieves higher accuracy than the state-of-the-art method while requiring less execution time. Moreover, by integrating SER into CIS, the proposed algorithm demonstrates excellent performance in further improving anti-interference capability and positioning accuracy.
Submitted 23 February, 2025; originally announced February 2025.
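The premise that pixel intensity is a local integral of an underlying intensity model has a clean closed form in 1D for an ideal step edge, which helps show why summation can recover subpixel position. This toy derivation is for intuition only and is not the paper's CIS/SER algorithm: with intensity A left of edge position e and B to its right over N unit pixels, sum(pixels) = A*e + B*(N - e), so e = (sum - N*B) / (A - B).

```python
import numpy as np

def subpixel_edge_1d(pixels: np.ndarray, A: float, B: float) -> float:
    """Recover the subpixel position e of an ideal step edge (intensity A for
    x < e, B for x > e) from pixel intensities viewed as local integrals."""
    N = len(pixels)
    return (pixels.sum() - N * B) / (A - B)

# Synthetic check: edge at x = 3.4 inside a 10-pixel line, A = 10, B = 50.
e_true, A, B = 3.4, 10.0, 50.0
bounds = np.arange(11, dtype=float)                # pixel boundaries 0..10
left = np.clip(e_true - bounds[:-1], 0, 1)         # fraction of each pixel left of edge
pixels = A * left + B * (1 - left)                 # pixel value = local integral
print(subpixel_edge_1d(pixels, A, B))              # ~3.4
```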
9. arXiv:2502.16493 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Trunk-branch Contrastive Network with Multi-view Deformable Aggregation for Multi-view Action Recognition
Authors: Yingyuan Yang, Guoyuan Liang, Can Wang, Xiaojun Wu
Abstract: Multi-view action recognition aims to identify actions in a given multi-view scene. Traditional studies initially extracted refined features from each view and then implemented paired interaction and integration, but they potentially overlooked the critical local features in each view. When observing objects from multiple perspectives, individuals typically form a comprehensive impression and subsequently fill in specific details. Drawing inspiration from this cognitive process, we propose a novel trunk-branch contrastive network (TBCNet) for RGB-based multi-view action recognition. Distinctively, TBCNet first obtains fused features in the trunk block and then implicitly supplements vital details provided by the branch block via contrastive learning, generating a more informative and comprehensive action representation. Within this framework, we construct two core components: multi-view deformable aggregation (MVDA) and trunk-branch contrastive learning. MVDA, employed in the trunk block, effectively facilitates multi-view feature fusion and adaptive cross-view spatio-temporal correlation, where a global aggregation module is utilized to emphasize significant spatial information and a composite relative position bias is designed to capture the intra- and cross-view relative positions. Moreover, a trunk-branch contrastive loss is constructed between aggregated features and refined details from each view. By incorporating two distinct weights for positive and negative samples, a weighted trunk-branch contrastive loss is proposed to extract valuable information and emphasize subtle inter-class differences. The effectiveness of TBCNet is verified by extensive experiments on four datasets: NTU-RGB+D 60, NTU-RGB+D 120, PKU-MMD, and N-UCLA. Compared to other RGB-based methods, our approach achieves state-of-the-art performance in cross-subject and cross-setting protocols.
Submitted 23 February, 2025; originally announced February 2025.
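The "two distinct weights for positive and negative samples" device can be illustrated with a generic weighted InfoNCE-style loss; the weighting scheme below is an assumption and not the exact trunk-branch loss from the paper.

```python
import torch
import torch.nn.functional as F

def weighted_contrastive_loss(anchor, positive, negatives,
                              w_pos=1.0, w_neg=1.0, tau=0.1):
    """InfoNCE-style loss with separate weights on the positive pair and the
    negative pairs (illustrative; the weighting scheme is an assumption).
    anchor, positive: (d,); negatives: (M, d)."""
    pos = w_pos * F.cosine_similarity(anchor, positive, dim=0) / tau
    neg = w_neg * F.cosine_similarity(anchor.unsqueeze(0), negatives, dim=1) / tau
    logits = torch.cat([pos.view(1), neg])            # positive is class 0
    return F.cross_entropy(logits.unsqueeze(0), torch.zeros(1, dtype=torch.long))
```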
10. arXiv:2502.16223 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Prompt as Knowledge Bank: Boost Vision-language model via Structural Representation for zero-shot medical detection
Authors: Yuguang Yang, Tongfei Chen, Haoyu Huang, Linlin Yang, Chunyu Xie, Dawei Leng, Xianbin Cao, Baochang Zhang
Abstract: Zero-shot medical detection can further improve detection performance without relying on annotated medical images even upon the fine-tuned model, showing great clinical value. Recent studies leverage grounded vision-language models (GLIP) to achieve this by using detailed disease descriptions as prompts for the target disease name during the inference phase. However, these methods typically treat prompts as equivalent context to the target name, making it difficult to assign specific disease knowledge based on visual information, leading to a coarse alignment between images and target descriptions. In this paper, we propose StructuralGLIP, which introduces an auxiliary branch to encode prompts into a latent knowledge bank layer-by-layer, enabling more context-aware and fine-grained alignment. Specifically, in each layer, we select highly similar features from both the image representation and the knowledge bank, forming structural representations that capture nuanced relationships between image patches and target descriptions. These features are then fused across modalities to further enhance detection performance. Extensive experiments demonstrate that StructuralGLIP achieves a +4.1% AP improvement over prior state-of-the-art methods across seven zero-shot medical detection benchmarks, and consistently improves fine-tuned models by +3.2% AP on endoscopy image datasets.
Submitted 22 February, 2025; originally announced February 2025.
Comments: Accepted as ICLR 2025 conference paper
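As a rough, assumed reading of the select-then-fuse step described above (not StructuralGLIP's actual architecture), one could select the top-k most similar image-patch/knowledge-bank feature pairs by cosine similarity and fuse each pair:

```python
import torch
import torch.nn.functional as F

def select_and_fuse(image_feats, bank_feats, k=4):
    """Pick the top-k most similar (image patch, bank entry) pairs by cosine
    similarity and fuse each pair by averaging (illustrative assumption).
    image_feats: (P, d) patch features; bank_feats: (K, d) prompt knowledge."""
    img = F.normalize(image_feats, dim=-1)
    bank = F.normalize(bank_feats, dim=-1)
    sim = img @ bank.T                              # (P, K) cosine similarities
    flat = sim.flatten().topk(k).indices            # indices of top-k pairs
    p_idx, b_idx = flat // sim.shape[1], flat % sim.shape[1]
    return (image_feats[p_idx] + bank_feats[b_idx]) / 2   # (k, d) fused features
```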
11. arXiv:2502.16207 [pdf, other]
Subjects: cs.SD (Sound), eess.AS (Audio and Speech Processing)
Title: Improving Speech Enhancement by Cross- and Sub-band Processing with State Space Model
Authors: Jizhen Li, Weiping Tu, Yuhong Yang, Xinmeng Xu, Yiqun Zhang, Yanzhen Ren
Abstract: Recently, the state space model (SSM) represented by Mamba has shown remarkable performance in long-term sequence modeling tasks, including speech enhancement. However, due to substantial differences in sub-band features, applying the same SSM to all sub-bands limits its inference capability. Additionally, when processing each time frame of the time-frequency representation, the SSM may forget certain high-frequency information of low energy, making the restoration of structure in the high-frequency bands challenging. For this reason, we propose Cross- and Sub-band Mamba (CSMamba). To assist the SSM in handling different sub-band features flexibly, we propose a band split block that splits the full band into four sub-bands with different widths based on their information similarity. We then allocate independent weights to each sub-band, thereby reducing the inference burden on the SSM. Furthermore, to mitigate the SSM's forgetting of low-energy information in the high-frequency bands, we introduce a spectrum restoration block that enhances the representation of cross-band features from multiple perspectives. Experimental results on the DNS Challenge 2021 dataset demonstrate that CSMamba outperforms several state-of-the-art (SOTA) speech enhancement methods on three objective evaluation metrics with fewer parameters.
Submitted 22 February, 2025; originally announced February 2025.
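The band split block lends itself to a simple sketch: partition the full-band spectrogram's frequency bins into four sub-bands of unequal width before per-band processing. The widths below are placeholders; the paper derives its splits from information similarity.

```python
import torch

def band_split(spec: torch.Tensor, widths=(16, 32, 64, 145)):
    """Split a (batch, time, freq) spectrogram into sub-bands of different
    widths (placeholder widths summing to 257 bins, i.e. a 512-point STFT);
    each sub-band would then get its own processing weights."""
    assert spec.shape[-1] == sum(widths), "widths must cover all frequency bins"
    return list(torch.split(spec, list(widths), dim=-1))

bands = band_split(torch.randn(2, 100, 257))
print([b.shape[-1] for b in bands])   # [16, 32, 64, 145]
```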
Despite being promising, such integration… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16071v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16071v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16071v1-abstract-full" style="display: none;"> Unit testing validates the correctness of the units of the software system under test and serves as the cornerstone in improving software quality and reliability. To reduce manual efforts in writing unit tests, some techniques have been proposed to automatically generate test assertions, with recent integration-based approaches considered state-of-the-art. Despite being promising, such integration-based approaches face several limitations, including reliance on lexical matching for assertion retrieval and a limited training corpus for assertion generation. This paper proposes a novel retrieval-augmented deep assertion generation approach, namely RetriGen, based on a hybrid retriever and a pre-trained language model (PLM)-based generator. Given a focal-test, RetriGen first builds a hybrid assertion retriever to search for the most relevant Test-Assert Pair from external codebases. The retrieval process considers lexical similarity and semantical similarity via a token-based and an embedding-based retriever, respectively. RetriGen then treats assertion generation as a sequence-to-sequence task and designs a PLM-based assertion generator to predict a correct assertion. We conduct extensive experiments to evaluate RetriGen against six state-of-the-art approaches across two large-scale datasets and two metrics. The results demonstrate that RetriGen achieves 57.66% accuracy and 73.24% CodeBLEU, outperforming all baselines with average improvements of 50.66% and 14.14%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16071v1-abstract-full').style.display = 'none'; document.getElementById('2502.16071v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
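<p class="is-size-7">As a rough illustration of the hybrid retrieval step in the RetriGen abstract above, the sketch below ranks candidate Test-Assert Pairs by blending lexical (token Jaccard) and semantic (embedding cosine) similarity. The <code>embed</code> helper and the blending weight <code>alpha</code> are illustrative stand-ins, not RetriGen's actual token-based and embedding-based retrievers.</p> <pre><code>
import math

def jaccard(a_tokens, b_tokens):
    # Lexical similarity: overlap of token sets.
    a, b = set(a_tokens), set(b_tokens)
    return len(a & b) / max(1, len(a | b))

def cosine(u, v):
    # Semantic similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(u, v))
    nu = math.sqrt(sum(x * x for x in u))
    nv = math.sqrt(sum(x * x for x in v))
    return dot / (nu * nv) if nu and nv else 0.0

def embed(text):
    # Placeholder hashing embedding; a real system would use a learned encoder.
    vec = [0.0] * 16
    for tok in text.split():
        vec[hash(tok) % 16] += 1.0
    return vec

def retrieve(focal_test, corpus, alpha=0.5):
    # Rank (test_code, assertion) pairs by the blended score.
    q_tokens, q_vec = focal_test.split(), embed(focal_test)
    scored = []
    for test_code, assertion in corpus:
        s = alpha * jaccard(q_tokens, test_code.split())
        s += (1 - alpha) * cosine(q_vec, embed(test_code))
        scored.append((s, test_code, assertion))
    return sorted(scored, reverse=True)
</code></pre>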
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACM Transactions on Software Engineering and Methodology (TOSEM 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15867">arXiv:2502.15867</a> <span> [<a href="https://arxiv.org/pdf/2502.15867">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Other Quantitative Biology">q-bio.OT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Strategic priorities for transformative progress in advancing biology with proteomics and artificial intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yingying Sun</a>, <a href="/search/cs?searchtype=author&query=A%2C+J">Jun A</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiwei Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Rui Sun</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+L">Liujia Qian</a>, <a href="/search/cs?searchtype=author&query=Payne%2C+S+H">Samuel H. Payne</a>, <a href="/search/cs?searchtype=author&query=Bittremieux%2C+W">Wout Bittremieux</a>, <a href="/search/cs?searchtype=author&query=Ralser%2C+M">Markus Ralser</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yi Chen</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhen Dong</a>, <a href="/search/cs?searchtype=author&query=Perez-Riverol%2C+Y">Yasset Perez-Riverol</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+A">Asif Khan</a>, <a href="/search/cs?searchtype=author&query=Sander%2C+C">Chris Sander</a>, <a href="/search/cs?searchtype=author&query=Aebersold%2C+R">Ruedi Aebersold</a>, <a href="/search/cs?searchtype=author&query=Vizca%C3%ADno%2C+J+A">Juan Antonio Vizca铆no</a>, <a href="/search/cs?searchtype=author&query=Krieger%2C+J+R">Jonathan R Krieger</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jianhua Yao</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+H">Han Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Linfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yunping Zhu</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+Y">Yue Xuan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+B+B">Benjamin Boyang Sun</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+L">Liang Qiao</a>, <a href="/search/cs?searchtype=author&query=Hermjakob%2C+H">Henning Hermjakob</a> , et al. (37 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15867v1-abstract-short" style="display: inline;"> Artificial intelligence (AI) is transforming scientific research, including proteomics. Advances in mass spectrometry (MS)-based proteomics data quality, diversity, and scale, combined with groundbreaking AI techniques, are unlocking new challenges and opportunities in biological discovery. 
Here, we highlight key areas where AI is driving innovation, from data analysis to new biological insights.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15867v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15867v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15867v1-abstract-full" style="display: none;"> Artificial intelligence (AI) is transforming scientific research, including proteomics. Advances in mass spectrometry (MS)-based proteomics data quality, diversity, and scale, combined with groundbreaking AI techniques, are unlocking new challenges and opportunities in biological discovery. Here, we highlight key areas where AI is driving innovation, from data analysis to new biological insights. These include developing an AI-friendly ecosystem for proteomics data generation, sharing, and analysis; improving peptide and protein identification and quantification; characterizing protein-protein interactions and protein complexes; advancing spatial and perturbation proteomics; integrating multi-omics data; and ultimately enabling AI-empowered virtual cells. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15867v1-abstract-full').style.display = 'none'; document.getElementById('2502.15867v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 2 figures, perspective in AI proteomics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15679">arXiv:2502.15679</a> <span> [<a href="https://arxiv.org/pdf/2502.15679">pdf</a>, <a href="https://arxiv.org/format/2502.15679">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BOSS: Benchmark for Observation Space Shift in Long-Horizon Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yue Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Linfeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+M">Mingyu Ding</a>, <a href="/search/cs?searchtype=author&query=Bertasius%2C+G">Gedas Bertasius</a>, <a href="/search/cs?searchtype=author&query=Szafir%2C+D">Daniel Szafir</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15679v1-abstract-short" style="display: inline;"> Robotics has long sought to develop visual-servoing robots capable of completing previously unseen long-horizon tasks. 
Hierarchical approaches offer a pathway for achieving this goal by executing skill combinations arranged by a task planner, with each visuomotor skill pre-trained using a specific imitation learning (IL) algorithm. However, even in simple long-horizon tasks like skill chaining, hi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15679v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15679v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15679v1-abstract-full" style="display: none;"> Robotics has long sought to develop visual-servoing robots capable of completing previously unseen long-horizon tasks. Hierarchical approaches offer a pathway for achieving this goal by executing skill combinations arranged by a task planner, with each visuomotor skill pre-trained using a specific imitation learning (IL) algorithm. However, even in simple long-horizon tasks like skill chaining, hierarchical approaches often struggle due to a problem we identify as Observation Space Shift (OSS), where the sequential execution of preceding skills causes shifts in the observation space, disrupting the performance of subsequent individually trained skill policies. To validate OSS and evaluate its impact on long-horizon tasks, we introduce BOSS (a Benchmark for Observation Space Shift). BOSS comprises three distinct challenges: "Single Predicate Shift", "Accumulated Predicate Shift", and "Skill Chaining", each designed to assess a different aspect of OSS's negative effect. We evaluated several recent popular IL algorithms on BOSS, including three Behavioral Cloning methods and the Visual Language Action model OpenVLA. Even on the simplest challenge, we observed average performance drops of 67%, 35%, 34%, and 54%, respectively, when comparing skill performance with and without OSS. Additionally, we investigate a potential solution to OSS that scales up the training data for each skill with a larger and more visually diverse set of demonstrations, with our results showing it is not sufficient to resolve OSS. The project page is: https://boss-benchmark.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15679v1-abstract-full').style.display = 'none'; document.getElementById('2502.15679v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
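<p class="is-size-7">To make the Observation Space Shift effect described above concrete, a toy evaluation loop can compare a skill policy's success rate on its training-time observations against observations already altered by preceding skills. Here <code>policy</code>, <code>episodes</code>, and <code>shift</code> are hypothetical placeholders, not part of the BOSS benchmark code.</p> <pre><code>
def success_rate(policy, episodes, shift=None):
    # Fraction of episodes where the policy's action matches the goal.
    wins = 0
    for obs, goal in episodes:
        if shift is not None:
            obs = shift(obs)  # e.g., objects displaced by earlier skills
        wins += int(policy(obs) == goal)
    return wins / len(episodes)

def oss_relative_drop(policy, episodes, shift):
    # Relative performance drop under shifted observations (the quantity
    # behind the 67%/35%/34%/54% drops quoted in the abstract).
    clean = success_rate(policy, episodes)
    shifted = success_rate(policy, episodes, shift)
    return (clean - shifted) / clean if clean else 0.0
</code></pre>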
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15567">arXiv:2502.15567</a> <span> [<a href="https://arxiv.org/pdf/2502.15567">pdf</a>, <a href="https://arxiv.org/format/2502.15567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Model Privacy: A Unified Framework to Understand Model Stealing Attacks and Defenses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ganghua Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuhong Yang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jie Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15567v1-abstract-short" style="display: inline;"> The use of machine learning (ML) has become increasingly prevalent in various domains, highlighting the importance of understanding and ensuring its safety. One pressing concern is the vulnerability of ML applications to model stealing attacks. These attacks involve adversaries attempting to recover a learned model through limited query-response interactions, such as those found in cloud-based ser… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15567v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15567v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15567v1-abstract-full" style="display: none;"> The use of machine learning (ML) has become increasingly prevalent in various domains, highlighting the importance of understanding and ensuring its safety. One pressing concern is the vulnerability of ML applications to model stealing attacks. These attacks involve adversaries attempting to recover a learned model through limited query-response interactions, such as those found in cloud-based services or on-chip artificial intelligence interfaces. While existing literature proposes various attack and defense strategies, these often lack a theoretical foundation and standardized evaluation criteria. In response, this work presents a framework called ``Model Privacy'', providing a foundation for comprehensively analyzing model stealing attacks and defenses. We establish a rigorous formulation for the threat model and objectives, propose methods to quantify the goodness of attack and defense strategies, and analyze the fundamental tradeoffs between utility and privacy in ML models. Our developed theory offers valuable insights into enhancing the security of ML models, especially highlighting the importance of the attack-specific structure of perturbations for effective defenses. We demonstrate the application of model privacy from the defender's perspective through various learning scenarios. Extensive experiments corroborate the insights and the effectiveness of defense mechanisms developed under the proposed framework. 
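<p class="is-size-7">The query-response threat model analyzed above can be pictured with a minimal sketch: an attacker fits a surrogate from (query, response) pairs while the defender perturbs each response before release. The one-dimensional linear model and i.i.d. Gaussian noise below are deliberate simplifications of the framework's general setting; one of the paper's points is that attack-specific perturbation structure matters more than generic noise.</p> <pre><code>
import random

def defended_service(true_model, x, noise=0.1):
    # Defender: release a perturbed prediction instead of the exact one.
    return true_model(x) + random.gauss(0.0, noise)

def steal(service, queries):
    # Attacker: least-squares fit of a surrogate y = w * x from responses.
    pairs = [(x, service(x)) for x in queries]
    sxx = sum(x * x for x, _ in pairs)
    sxy = sum(x * y for x, y in pairs)
    return sxy / sxx if sxx else 0.0

true_model = lambda x: 2.0 * x
service = lambda x: defended_service(true_model, x)
w_hat = steal(service, [i / 10 for i in range(1, 50)])
print(f"surrogate slope {w_hat:.3f} vs. true slope 2.0")  # noise limits recovery
</code></pre>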
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15567v1-abstract-full').style.display = 'none'; document.getElementById('2502.15567v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15181">arXiv:2502.15181</a> <span> [<a href="https://arxiv.org/pdf/2502.15181">pdf</a>, <a href="https://arxiv.org/format/2502.15181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Debunking the Myth of Join Ordering: Toward Robust SQL Analytics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Junyi Zhao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+K">Kai Su</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifei Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiangyao Yu</a>, <a href="/search/cs?searchtype=author&query=Koutris%2C+P">Paraschos Koutris</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huanchen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15181v1-abstract-short" style="display: inline;"> Join order optimization is critical in achieving good query performance. Despite decades of research and practice, modern query optimizers could still generate inferior join plans that are orders of magnitude slower than optimal. Existing research on robust query processing often lacks theoretical guarantees on join-order robustness while sacrificing query performance. In this paper, we rediscover… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15181v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15181v1-abstract-full" style="display: none;"> Join order optimization is critical in achieving good query performance. Despite decades of research and practice, modern query optimizers could still generate inferior join plans that are orders of magnitude slower than optimal. Existing research on robust query processing often lacks theoretical guarantees on join-order robustness while sacrificing query performance. In this paper, we rediscover the recent Predicate Transfer technique from a robustness point of view. We introduce two new algorithms, LargestRoot and SafeSubjoin, and then propose Robust Predicate Transfer (RPT) that is provably robust against arbitrary join orders of an acyclic query. We integrated Robust Predicate Transfer with DuckDB, a state-of-the-art analytical database, and evaluated against all the queries in TPC-H, JOB, and TPC-DS benchmarks. Our experimental results show that RPT improves join-order robustness by orders of magnitude compared to the baseline. 
With RPT, the largest ratio between the maximum and minimum execution time out of random join orders for a single acyclic query is only 1.6x (the ratio is close to 1 for most evaluated queries). Meanwhile, applying RPT also improves the end-to-end query performance by 1.5x (per-query geometric mean). We hope that this work sheds light on solving the practical join ordering problem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15181v1-abstract-full').style.display = 'none'; document.getElementById('2502.15181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15092">arXiv:2502.15092</a> <span> [<a href="https://arxiv.org/pdf/2502.15092">pdf</a>, <a href="https://arxiv.org/format/2502.15092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Singular Spectrum for Large Language Model Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Dengjie Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+T">Tiancheng Shen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yao Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Baisong Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhongying Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Masheng Yang</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yibo Yang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yujie Zhong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming-Hsuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15092v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable capabilities, yet prohibitive parameter complexity often hinders their deployment. Existing singular value decomposition (SVD) based compression methods simply deem singular values as importance scores of decomposed components. However, this importance ordered by singular values does not necessarily correlate with the performance of a downs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15092v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15092v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15092v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable capabilities, yet prohibitive parameter complexity often hinders their deployment. Existing singular value decomposition (SVD) based compression methods simply deem singular values as importance scores of decomposed components. 
However, this importance ordered by singular values does not necessarily correlate with the performance of a downstream task. In this work, we introduce SoCo (Singular spectrum optimization for large language model Compression), a novel compression framework that learns to rescale the decomposed components of SVD in a data-driven manner. Concretely, we employ a learnable diagonal matrix to assign importance scores for singular spectrum and develop a three-stage training process that progressively refines these scores from initial coarse compression to fine-grained sparsification, thereby striking an effective balance between aggressive model compression and performance preservation. Thanks to the learnable singular spectrum, SoCo adaptively prunes components according to the sparsified importance scores, rather than relying on the fixed order of singular values. More importantly, the remaining components with amplified importance scores can compensate for the loss of the pruned ones. Experimental evaluations across multiple LLMs and benchmarks demonstrate that SoCo surpasses the state-of-the-art methods in model compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15092v1-abstract-full').style.display = 'none'; document.getElementById('2502.15092v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14864">arXiv:2502.14864</a> <span> [<a href="https://arxiv.org/pdf/2502.14864">pdf</a>, <a href="https://arxiv.org/format/2502.14864">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Multimodal RAG through a Chart-based Document Question-Answering Generation Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuming Yang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+J">Jiang Zhong</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+L">Li Jin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingwang Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jingpeng Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yang Bai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Rui Jiang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+K">Kaiwen Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14864v1-abstract-short" style="display: inline;"> Multimodal Retrieval-Augmented Generation (MRAG) enhances reasoning capabilities by integrating external knowledge.
However, existing benchmarks primarily focus on simple image-text interactions, overlooking complex visual formats like charts that are prevalent in real-world applications. In this work, we introduce a novel task, Chart-based MRAG, to address this limitation. To semi-automatically g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14864v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14864v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14864v1-abstract-full" style="display: none;"> Multimodal Retrieval-Augmented Generation (MRAG) enhances reasoning capabilities by integrating external knowledge. However, existing benchmarks primarily focus on simple image-text interactions, overlooking complex visual formats like charts that are prevalent in real-world applications. In this work, we introduce a novel task, Chart-based MRAG, to address this limitation. To semi-automatically generate high-quality evaluation samples, we propose CHARt-based document question-answering GEneration (CHARGE), a framework that produces evaluation data through structured keypoint extraction, crossmodal verification, and keypoint-based generation. By combining CHARGE with expert validation, we construct Chart-MRAG Bench, a comprehensive benchmark for chart-based MRAG evaluation, featuring 4,738 question-answering pairs across 8 domains from real-world documents. Our evaluation reveals three critical limitations in current approaches: (1) unified multimodal embedding retrieval methods struggle in chart-based scenarios, (2) even with ground-truth retrieval, state-of-the-art MLLMs achieve only 58.19% Correctness and 73.87% Coverage scores, and (3) MLLMs demonstrate consistent text-over-visual modality bias during Chart-based MRAG reasoning. The CHARGE and Chart-MRAG Bench are released at https://github.com/Nomothings/CHARGE.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14864v1-abstract-full').style.display = 'none'; document.getElementById('2502.14864v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
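<p class="is-size-7">A schematic reading of the keypoint-based evaluation mentioned above: Correctness rewards predicted keypoints that are right, while Coverage rewards hitting the reference keypoints. The exact-set matching below is an assumption made for illustration; the benchmark's actual scoring is defined in the paper.</p> <pre><code>
def keypoint_scores(predicted, reference):
    # Both arguments are collections of extracted keypoint strings.
    pred, ref = set(predicted), set(reference)
    matched = pred & ref
    correctness = len(matched) / max(1, len(pred))  # share of the answer that is right
    coverage = len(matched) / max(1, len(ref))      # share of the reference that is hit
    return correctness, coverage

print(keypoint_scores(
    {"peak occurs in 2021", "values decline after Q2"},
    {"peak occurs in 2021", "values decline after Q2", "roughly 8% annual growth"},
))  # (1.0, 0.666...)
</code></pre>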
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14846">arXiv:2502.14846</a> <span> [<a href="https://arxiv.org/pdf/2502.14846">pdf</a>, <a href="https://arxiv.org/format/2502.14846">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Scaling Text-Rich Image Understanding via Code-Guided Synthetic Multimodal Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yue Yang</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+A">Ajay Patel</a>, <a href="/search/cs?searchtype=author&query=Deitke%2C+M">Matt Deitke</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+T">Tanmay Gupta</a>, <a href="/search/cs?searchtype=author&query=Weihs%2C+L">Luca Weihs</a>, <a href="/search/cs?searchtype=author&query=Head%2C+A">Andrew Head</a>, <a href="/search/cs?searchtype=author&query=Yatskar%2C+M">Mark Yatskar</a>, <a href="/search/cs?searchtype=author&query=Callison-Burch%2C+C">Chris Callison-Burch</a>, <a href="/search/cs?searchtype=author&query=Krishna%2C+R">Ranjay Krishna</a>, <a href="/search/cs?searchtype=author&query=Kembhavi%2C+A">Aniruddha Kembhavi</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+C">Christopher Clark</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14846v1-abstract-short" style="display: inline;"> Reasoning about images with rich text, such as charts and documents, is a critical application of vision-language models (VLMs). However, VLMs often struggle in these domains due to the scarcity of diverse text-rich vision-language data. To address this challenge, we present CoSyn, a framework that leverages the coding capabilities of text-only large language models (LLMs) to automatically create… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14846v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14846v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14846v1-abstract-full" style="display: none;"> Reasoning about images with rich text, such as charts and documents, is a critical application of vision-language models (VLMs). However, VLMs often struggle in these domains due to the scarcity of diverse text-rich vision-language data. To address this challenge, we present CoSyn, a framework that leverages the coding capabilities of text-only large language models (LLMs) to automatically create synthetic text-rich multimodal data. Given input text describing a target domain (e.g., "nutrition fact labels"), CoSyn prompts an LLM to generate code (Python, HTML, LaTeX, etc.) for rendering synthetic images. With the underlying code as textual representations of the synthetic images, CoSyn can generate high-quality instruction-tuning data, again relying on a text-only LLM. Using CoSyn, we constructed a dataset comprising 400K images and 2.7M rows of vision-language instruction-tuning data. 
Comprehensive experiments on seven benchmarks demonstrate that models trained on our synthetic data achieve state-of-the-art performance among competitive open-source models, including Llama 3.2, and surpass proprietary models such as GPT-4V and Gemini 1.5 Flash. Furthermore, CoSyn can produce synthetic pointing data, enabling VLMs to ground information within input images, showcasing its potential for developing multimodal agents capable of acting in real-world environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14846v1-abstract-full').style.display = 'none'; document.getElementById('2502.14846v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 19 figures, 9 tables, website: https://yueyang1996.github.io/cosyn/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14743">arXiv:2502.14743</a> <span> [<a href="https://arxiv.org/pdf/2502.14743">pdf</a>, <a href="https://arxiv.org/ps/2502.14743">ps</a>, <a href="https://arxiv.org/format/2502.14743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-Agent Coordination across Diverse Applications: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lijun Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yijun Yang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Q">Qiqi Duan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuhui Shi</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+C">Chao Lyu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yu-Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chin-Teng Lin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14743v2-abstract-short" style="display: inline;"> Multi-agent coordination studies the underlying mechanism enabling the trending spread of diverse multi-agent systems (MAS) and has received increasing attention, driven by the expansion of emerging applications and rapid AI advances. 
This survey outlines the current state of coordination research across applications through a unified understanding that answers four fundamental coordination questi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14743v2-abstract-full').style.display = 'inline'; document.getElementById('2502.14743v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14743v2-abstract-full" style="display: none;"> Multi-agent coordination studies the underlying mechanism enabling the trending spread of diverse multi-agent systems (MAS) and has received increasing attention, driven by the expansion of emerging applications and rapid AI advances. This survey outlines the current state of coordination research across applications through a unified understanding that answers four fundamental coordination questions: (1) what is coordination; (2) why coordination; (3) who to coordinate with; and (4) how to coordinate. Our purpose is to explore existing ideas and expertise in coordination and their connections across diverse applications, while identifying and highlighting emerging and promising research directions. First, general coordination problems that are essential to varied applications are identified and analyzed. Second, a number of MAS applications are surveyed, ranging from widely studied domains, e.g., search and rescue, warehouse automation and logistics, and transportation systems, to emerging fields including humanoid and anthropomorphic robots, satellite systems, and large language models (LLMs). Finally, open challenges about the scalability, heterogeneity, and learning mechanisms of MAS are analyzed and discussed. In particular, we identify the hybridization of hierarchical and decentralized coordination, human-MAS coordination, and LLM-based MAS as promising future directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14743v2-abstract-full').style.display = 'none'; document.getElementById('2502.14743v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 4 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14704">arXiv:2502.14704</a> <span> [<a href="https://arxiv.org/pdf/2502.14704">pdf</a>, <a href="https://arxiv.org/format/2502.14704">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Not All Data are Good Labels: On the Self-supervised Labeling for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuxuan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dalin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hua Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gang Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14704v2-abstract-short" style="display: inline;"> Time Series Forecasting (TSF) is a crucial task in various domains, yet existing TSF models rely heavily on high-quality data and insufficiently exploit all available data. This paper explores a novel self-supervised approach to re-label time series datasets by inherently constructing candidate datasets. During the optimization of a simple reconstruction network, intermediates are used as pseudo l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14704v2-abstract-full').style.display = 'inline'; document.getElementById('2502.14704v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14704v2-abstract-full" style="display: none;"> Time Series Forecasting (TSF) is a crucial task in various domains, yet existing TSF models rely heavily on high-quality data and insufficiently exploit all available data. This paper explores a novel self-supervised approach to re-label time series datasets by inherently constructing candidate datasets. During the optimization of a simple reconstruction network, intermediates are used as pseudo labels in a self-supervised paradigm, improving generalization for any predictor. We introduce the Self-Correction with Adaptive Mask (SCAM), which discards overfitted components and selectively replaces them with pseudo labels generated from reconstructions. Additionally, we incorporate Spectral Norm Regularization (SNR) to further suppress overfitting from a loss landscape perspective. Our experiments on eleven real-world datasets demonstrate that SCAM consistently improves the performance of various backbone models. This work offers a new perspective on constructing datasets and enhancing the generalization of TSF models through self-supervised learning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14704v2-abstract-full').style.display = 'none'; document.getElementById('2502.14704v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14344">arXiv:2502.14344</a> <span> [<a href="https://arxiv.org/pdf/2502.14344">pdf</a>, <a href="https://arxiv.org/format/2502.14344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Accurate Binary Spiking Neural Networks: Learning with Adaptive Gradient Modulation Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yu Liang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+W">Wenjie Wei</a>, <a href="/search/cs?searchtype=author&query=Belatreche%2C+A">Ammar Belatreche</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+H">Honglin Cao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zijian Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Malu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14344v1-abstract-short" style="display: inline;"> Binary Spiking Neural Networks (BSNNs) inherit the eventdriven paradigm of SNNs, while also adopting the reduced storage burden of binarization techniques. These distinct advantages grant BSNNs lightweight and energy-efficient characteristics, rendering them ideal for deployment on resource-constrained edge devices. However, due to the binary synaptic weights and non-differentiable spike function,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14344v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14344v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14344v1-abstract-full" style="display: none;"> Binary Spiking Neural Networks (BSNNs) inherit the eventdriven paradigm of SNNs, while also adopting the reduced storage burden of binarization techniques. These distinct advantages grant BSNNs lightweight and energy-efficient characteristics, rendering them ideal for deployment on resource-constrained edge devices. However, due to the binary synaptic weights and non-differentiable spike function, effectively training BSNNs remains an open question. In this paper, we conduct an in-depth analysis of the challenge for BSNN learning, namely the frequent weight sign flipping problem. 
To mitigate this issue, we propose an Adaptive Gradient Modulation Mechanism (AGMM), which is designed to reduce the frequency of weight sign flipping by adaptively adjusting the gradients during the learning process. The proposed AGMM can enable BSNNs to achieve faster convergence speed and higher accuracy, effectively narrowing the gap between BSNNs and their full-precision equivalents. We validate AGMM on both static and neuromorphic datasets, and results indicate that it achieves state-of-the-art results among BSNNs. This work substantially reduces storage demands and enhances SNNs' inherent energy efficiency, making them highly feasible for resource-constrained environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14344v1-abstract-full').style.display = 'none'; document.getElementById('2502.14344v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 8 figures, AAAI conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14294">arXiv:2502.14294</a> <span> [<a href="https://arxiv.org/pdf/2502.14294">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> DAG: Deep Adaptive and Generative $K$-Free Community Detection on Attributed Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuwen Yang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yue Ding</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongtao Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wenqing Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Ziming Wu</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+W">Wendong Bi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14294v1-abstract-short" style="display: inline;"> Community detection on attributed graphs with rich semantic and topological information offers great potential for real-world network analysis, especially user matching in online games. Graph Neural Networks (GNNs) have recently enabled Deep Graph Clustering (DGC) methods to learn cluster assignments from semantic and topological information. 
However, their success depends on the prior knowledge r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14294v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14294v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14294v1-abstract-full" style="display: none;"> Community detection on attributed graphs with rich semantic and topological information offers great potential for real-world network analysis, especially user matching in online games. Graph Neural Networks (GNNs) have recently enabled Deep Graph Clustering (DGC) methods to learn cluster assignments from semantic and topological information. However, their success depends on the prior knowledge related to the number of communities $K$, which is unrealistic due to the high costs and privacy issues of acquisition. In this paper, we investigate the community detection problem without prior $K$, referred to as $K$-Free Community Detection problem. To address this problem, we propose a novel Deep Adaptive and Generative model (DAG) for community detection without specifying the prior $K$. DAG consists of three key components, i.e., a node representation learning module with masked attribute reconstruction, a community affiliation readout module, and a community number search module with group sparsity. These components enable DAG to convert the process of non-differentiable grid search for the community number, i.e., a discrete hyperparameter in existing DGC methods, into a differentiable learning process. In such a way, DAG can simultaneously perform community detection and community number search end-to-end. To alleviate the cost of acquiring community labels in real-world applications, we design a new metric, EDGE, to evaluate community detection methods even when the labels are not feasible. Extensive offline experiments on five public datasets and a real-world online mobile game dataset demonstrate the superiority of our DAG over the existing state-of-the-art (SOTA) methods. DAG has a relative increase of 7.35\% in teams in a Tencent online game compared with the best competitor. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14294v1-abstract-full').style.display = 'none'; document.getElementById('2502.14294v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
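<p class="is-size-7">The community number search described above rests on a group-sparsity penalty, which can be sketched as an L2,1 norm on the columns of a node-community affiliation matrix: communities whose columns shrink below a tolerance are pruned, so the community number emerges from optimization rather than grid search. The shapes, penalty weight, and tolerance below are illustrative assumptions.</p> <pre><code>
import numpy as np

def group_sparsity_penalty(affiliation, lam=0.1):
    # L2,1 norm: sum of per-community (per-column) L2 norms.
    return lam * np.linalg.norm(affiliation, axis=0).sum()

def active_communities(affiliation, eps=1e-3):
    # Communities whose column norm survives the penalty are kept.
    return int((np.linalg.norm(affiliation, axis=0) > eps).sum())

# 1000 nodes, 32 candidate communities; the mask zeroes out "dead" columns
# the way the penalty would during training.
A = np.abs(np.random.randn(1000, 32)) * (np.random.rand(32) > 0.6)
print(group_sparsity_penalty(A), active_communities(A))
</code></pre>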
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGKDD 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13957">arXiv:2502.13957</a> <span> [<a href="https://arxiv.org/pdf/2502.13957">pdf</a>, <a href="https://arxiv.org/format/2502.13957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RAG-Gym: Optimizing Reasoning and Search Agents with Process Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guangzhi Xiong</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Q">Qiao Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yin Fang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haolin Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+F">Fangyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhixing Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhiyong Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aidong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13957v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) has shown great potential for knowledge-intensive tasks, but its traditional architectures rely on static retrieval, limiting their effectiveness for complex questions that require sequential information-seeking. While agentic reasoning and search offer a more adaptive approach, most existing methods depend heavily on prompt engineering. In this work, we introd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13957v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13957v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) has shown great potential for knowledge-intensive tasks, but its traditional architectures rely on static retrieval, limiting their effectiveness for complex questions that require sequential information-seeking. While agentic reasoning and search offer a more adaptive approach, most existing methods depend heavily on prompt engineering. In this work, we introduce RAG-Gym, a unified optimization framework that enhances information-seeking agents through fine-grained process supervision at each search step. We also propose ReSearch, a novel agent architecture that synergizes answer reasoning and search query generation within the RAG-Gym framework. 
Experiments on four challenging datasets show that RAG-Gym improves performance by up to 25.6\% across various agent architectures, with ReSearch consistently outperforming existing baselines. Further analysis highlights the effectiveness of advanced LLMs as process reward judges and the transferability of trained reward models as verifiers for different LLMs. Additionally, we examine the scaling properties of training and inference in agentic RAG. The project homepage is available at https://rag-gym.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13957v1-abstract-full').style.display = 'none'; document.getElementById('2502.13957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13954">arXiv:2502.13954</a> <span> [<a href="https://arxiv.org/pdf/2502.13954">pdf</a>, <a href="https://arxiv.org/format/2502.13954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Latent Distribution Decoupling: A Probabilistic Framework for Uncertainty-Aware Multimodal Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingwang Huang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+J">Jiang Zhong</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Q">Qin Lei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jinpeng Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuming Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sirui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peiguang Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+K">Kaiwen Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13954v1-abstract-short" style="display: inline;"> Multimodal multi-label emotion recognition (MMER) aims to identify the concurrent presence of multiple emotions in multimodal data. Existing studies primarily focus on improving fusion strategies and modeling modality-to-label dependencies. However, they often overlook the impact of \textbf{aleatoric uncertainty}, which is the inherent noise in the multimodal data and hinders the effectiveness of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13954v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13954v1-abstract-full" style="display: none;"> Multimodal multi-label emotion recognition (MMER) aims to identify the concurrent presence of multiple emotions in multimodal data. Existing studies primarily focus on improving fusion strategies and modeling modality-to-label dependencies. 
However, they often overlook the impact of aleatoric uncertainty, which is the inherent noise in the multimodal data and hinders the effectiveness of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13954v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13954v1-abstract-full" style="display: none;"> Multimodal multi-label emotion recognition (MMER) aims to identify the concurrent presence of multiple emotions in multimodal data. Existing studies primarily focus on improving fusion strategies and modeling modality-to-label dependencies. However, they often overlook the impact of aleatoric uncertainty, which is the inherent noise in the multimodal data and hinders the effectiveness of modality fusion by introducing ambiguity into feature representations. To address this issue and effectively model aleatoric uncertainty, this paper proposes Latent emotional Distribution Decomposition with Uncertainty perception (LDDU) framework from a novel perspective of latent emotional space probabilistic modeling. Specifically, we introduce a contrastive disentangled distribution mechanism within the emotion space to model the multimodal data, allowing for the extraction of semantic features and uncertainty. Furthermore, we design an uncertainty-aware fusion multimodal method that accounts for the dispersed distribution of uncertainty and integrates distribution information. Experimental results show that LDDU achieves state-of-the-art performance on the CMU-MOSEI and M$^3$ED datasets, highlighting the importance of uncertainty modeling in MMER. Code is available at https://github.com/201983290498/lddu_mmer.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13954v1-abstract-full').style.display = 'none'; document.getElementById('2502.13954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13925">arXiv:2502.13925</a> <span> [<a href="https://arxiv.org/pdf/2502.13925">pdf</a>, <a href="https://arxiv.org/format/2502.13925">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Single Frames: Can LMMs Comprehend Temporal and Contextual Narratives in Image Sequences? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaochen Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jialin Song</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+L">Longyu Guan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yixin Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weiyao Luo</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yifan Pu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiru Wang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xiangdi Meng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhifang Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13925v1-abstract-short" style="display: inline;"> Large Multimodal Models (LMMs) have achieved remarkable success across various visual-language tasks. However, existing benchmarks predominantly focus on single-image understanding, leaving the analysis of image sequences largely unexplored.
To address this limitation, we introduce StripCipher, a comprehensive benchmark designed to evaluate the capabilities of LMMs to comprehend and reason over sequen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13925v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13925v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13925v1-abstract-full" style="display: none;"> Large Multimodal Models (LMMs) have achieved remarkable success across various visual-language tasks. However, existing benchmarks predominantly focus on single-image understanding, leaving the analysis of image sequences largely unexplored. To address this limitation, we introduce StripCipher, a comprehensive benchmark designed to evaluate the capabilities of LMMs to comprehend and reason over sequential images. StripCipher comprises a human-annotated dataset and three challenging subtasks: visual narrative comprehension, contextual frame prediction, and temporal narrative reordering. Our evaluation of $16$ state-of-the-art LMMs, including GPT-4o and Qwen2.5VL, reveals a significant performance gap compared to human capabilities, particularly in tasks that require reordering shuffled sequential images. For instance, GPT-4o achieves only 23.93% accuracy in the reordering subtask, which is 56.07% lower than human performance. Further quantitative analysis discusses several factors, such as the input format of images, that affect the performance of LMMs in sequential understanding, underscoring the fundamental challenges that remain in the development of LMMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13925v1-abstract-full').style.display = 'none'; document.getElementById('2502.13925v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
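<p class="is-size-7">For concreteness, the reordering numbers above can be read as exact-match permutation accuracy. A minimal scoring sketch (illustrative only; the function name and input layout are ours, not StripCipher's):</p>
<pre><code class="language-python">
# Illustrative sketch: score a temporal narrative reordering subtask by
# exact-match permutation accuracy, the metric family implied by the
# reported 23.93% figure (names and I/O layout are ours).
def reordering_accuracy(predicted_orders, gold_orders):
    """Fraction of sequences whose predicted frame order exactly
    matches the annotated order."""
    assert len(predicted_orders) == len(gold_orders)
    exact = sum(
        1 for pred, gold in zip(predicted_orders, gold_orders)
        if list(pred) == list(gold)
    )
    return exact / len(gold_orders)

# Example: two shuffled 4-frame strips, one reordered correctly.
preds = [[2, 0, 1, 3], [3, 1, 0, 2]]
golds = [[2, 0, 1, 3], [0, 1, 2, 3]]
print(reordering_accuracy(preds, golds))  # 0.5
</code></pre>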
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13881">arXiv:2502.13881</a> <span> [<a href="https://arxiv.org/pdf/2502.13881">pdf</a>, <a href="https://arxiv.org/format/2502.13881">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> PSCon: Toward Conversational Product Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+J">Jie Zou</a>, <a href="/search/cs?searchtype=author&query=Aliannejadi%2C+M">Mohammad Aliannejadi</a>, <a href="/search/cs?searchtype=author&query=Kanoulas%2C+E">Evangelos Kanoulas</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Shuxi Han</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Heli Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H+T">Heng Tao Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13881v1-abstract-short" style="display: inline;"> Conversational Product Search (CPS) is confined to simulated conversations due to the lack of real-world CPS datasets that reflect human-like language. Additionally, current conversational datasets offer limited support for cross-market and multi-lingual usage. In this paper, we introduce a new CPS data collection protocol and present PSCon, a novel CPS dataset designed to assist product search via h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13881v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13881v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13881v1-abstract-full" style="display: none;"> Conversational Product Search (CPS) is confined to simulated conversations due to the lack of real-world CPS datasets that reflect human-like language. Additionally, current conversational datasets offer limited support for cross-market and multi-lingual usage. In this paper, we introduce a new CPS data collection protocol and present PSCon, a novel CPS dataset designed to assist product search via human-like conversations. The dataset is constructed using a coached human-to-human data collection protocol and supports two languages and dual markets. Also, the dataset enables thorough exploration of six subtasks of CPS: user intent detection, keyword extraction, system action prediction, question selection, item ranking, and response generation. Furthermore, we offer an analysis of the dataset and propose a benchmark model on the proposed CPS dataset.
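<p class="is-size-7">To illustrate how a single conversation turn could carry annotations for all six CPS subtasks listed above, here is a hypothetical record layout (field names and values are ours; the released dataset's schema may differ):</p>
<pre><code class="language-python">
# Hypothetical record layout (ours, not the dataset's schema), showing
# how one turn could be annotated for the six CPS subtasks.
from dataclasses import dataclass, field

@dataclass
class CPSTurn:
    utterance: str
    user_intent: str                                   # user intent detection
    keywords: list = field(default_factory=list)       # keyword extraction
    system_action: str = ""                            # system action prediction
    clarifying_question: str = ""                      # question selection
    ranked_items: list = field(default_factory=list)   # item ranking
    response: str = ""                                 # response generation

turn = CPSTurn(
    utterance="I need running shoes for trail use",
    user_intent="reveal",
    keywords=["running shoes", "trail"],
    system_action="ask_clarifying_question",
    clarifying_question="What is your budget range?",
)
print(turn.user_intent, turn.keywords)
</code></pre>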
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13881v1-abstract-full').style.display = 'none'; document.getElementById('2502.13881v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13794">arXiv:2502.13794</a> <span> [<a href="https://arxiv.org/pdf/2502.13794">pdf</a>, <a href="https://arxiv.org/format/2502.13794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LESA: Learnable LLM Layer Scaling-Up </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifei Yang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zouying Cao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xinbei Ma</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+L">Libo Qin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hai Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13794v1-abstract-short" style="display: inline;"> Training Large Language Models (LLMs) from scratch requires immense computational resources, making it prohibitively expensive. Model scaling-up offers a promising solution by leveraging the parameters of smaller models to create larger ones. However, existing depth scaling-up methods rely on empirical heuristic rules for layer duplication, which result in poorer initialization and slower converge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13794v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13794v1-abstract-full" style="display: none;"> Training Large Language Models (LLMs) from scratch requires immense computational resources, making it prohibitively expensive. Model scaling-up offers a promising solution by leveraging the parameters of smaller models to create larger ones. However, existing depth scaling-up methods rely on empirical heuristic rules for layer duplication, which result in poorer initialization and slower convergence during continual pre-training. We propose \textbf{LESA}, a novel learnable method for depth scaling-up. 
By concatenating parameters from each layer and applying Singular Value Decomposition, we uncover latent patterns between layers, suggesting that inter-layer parameters can be learned. LESA uses a neural network to predict the parameters inserted between adjacent layers, enabling better initialization and faster training. Experiments show that LESA outperforms existing baselines, achieving superior performance with less than half the computational cost during continual pre-training. Extensive analyses demonstrate its effectiveness across different model sizes and tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13794v1-abstract-full').style.display = 'none'; document.getElementById('2502.13794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13569">arXiv:2502.13569</a> <span> [<a href="https://arxiv.org/pdf/2502.13569">pdf</a>, <a href="https://arxiv.org/format/2502.13569">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Model Evolution Framework with Genetic Algorithm for Multi-Task Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yan Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wengang Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaodong Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Wanxuan Lu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Y">Yingyan Hou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Houqiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13569v1-abstract-short" style="display: inline;"> Multi-task reinforcement learning employs a single policy to complete various tasks, aiming to develop an agent with generalizability across different scenarios. Given the shared characteristics of tasks, the agent's learning efficiency can be enhanced through parameter sharing. Existing approaches typically use a routing network to generate specific routes for each task and reconstruct a set of m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13569v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13569v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13569v1-abstract-full" style="display: none;"> Multi-task reinforcement learning employs a single policy to complete various tasks, aiming to develop an agent with generalizability across different scenarios. Given the shared characteristics of tasks, the agent's learning efficiency can be enhanced through parameter sharing. Existing approaches typically use a routing network to generate specific routes for each task and reconstruct a set of modules into diverse models to complete multiple tasks simultaneously. 
However, due to the inherent differences between tasks, it is crucial to allocate resources based on task difficulty, which is constrained by the model's structure. To this end, we propose a Model Evolution framework with Genetic Algorithm (MEGA), which enables the model to evolve during training according to the difficulty of the tasks. When the current model is insufficient for certain tasks, the framework will automatically incorporate additional modules, enhancing the model's capabilities. Moreover, to adapt to our model evolution framework, we introduce a genotype module-level model, using binary sequences as genotype policies for model reconstruction, while leveraging a non-gradient genetic algorithm to optimize these genotype policies. Unlike routing networks with fixed output dimensions, our approach allows for the dynamic adjustment of the genotype policy length, enabling it to accommodate models with a varying number of modules. We conducted experiments on various robotic manipulation tasks in the Meta-World benchmark. Our state-of-the-art performance demonstrated the effectiveness of the MEGA framework. We will release our source code to the public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13569v1-abstract-full').style.display = 'none'; document.getElementById('2502.13569v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13491">arXiv:2502.13491</a> <span> [<a href="https://arxiv.org/pdf/2502.13491">pdf</a>, <a href="https://arxiv.org/format/2502.13491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Cloth Animation with Time-dependent Persistent Wrinkles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+D">Deshan Gong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yin Yang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+T">Tianjia Shao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">He Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13491v1-abstract-short" style="display: inline;"> Persistent wrinkles are often observed on crumpled garments, e.g., the wrinkles around the knees after sitting for a while. Such wrinkles recover easily if the deformation is not held for long, but otherwise become persistent. Since they are vital to the visual realism of cloth animation, we aim to simulate realistic-looking persistent wrinkles.
To this end, we present a physics-inspired fine-grained wrinkle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13491v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13491v1-abstract-full" style="display: none;"> Persistent wrinkles are often observed on crumpled garments, e.g., the wrinkles around the knees after sitting for a while. Such wrinkles recover easily if the deformation is not held for long, but otherwise become persistent. Since they are vital to the visual realism of cloth animation, we aim to simulate realistic-looking persistent wrinkles. To this end, we present a physics-inspired fine-grained wrinkle model. Different from existing methods, we recognize the importance of the interplay between internal friction and plasticity during wrinkle formation. Furthermore, we model their time dependence for persistent wrinkles. Our model is capable of not only simulating realistic wrinkle patterns, but also their time-dependent changes according to how long the deformation is maintained. Through extensive experiments, we show that our model is effective in simulating realistic spatially and temporally varying wrinkles, versatile in simulating different materials, and capable of generating more fine-grained wrinkles than the state of the art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13491v1-abstract-full').style.display = 'none'; document.getElementById('2502.13491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
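<p class="is-size-7">The behavior the abstract describes, deformations held longer becoming harder to undo, can be pictured with a toy time-dependent plastic rest angle for a single cloth hinge. This is our simplification for intuition, not the paper's model:</p>
<pre><code class="language-python">
# Minimal sketch (our simplification, not the paper's model): a
# time-dependent plastic rest angle for one cloth hinge. Holding a
# deformation lets the rest angle creep toward the current angle
# (the wrinkle becomes persistent); releasing it lets the rest angle
# relax back toward flat (the wrinkle recovers).
def step_rest_angle(rest, current, held, dt, creep_rate=0.5, recovery_rate=0.1):
    rate = creep_rate if held else recovery_rate
    target = current if held else 0.0  # 0.0 = flat reference state
    return rest + rate * dt * (target - rest)

rest = 0.0
for _ in range(100):                   # sit for a while: fold held at 0.8 rad
    rest = step_rest_angle(rest, 0.8, True, 0.01)
print(round(rest, 3))                  # rest angle has crept toward 0.8
for _ in range(100):                   # stand up: fold released
    rest = step_rest_angle(rest, 0.0, False, 0.01)
print(round(rest, 3))                  # only partial recovery: wrinkle persists
</code></pre>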
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12677">arXiv:2502.12677</a> <span> [<a href="https://arxiv.org/pdf/2502.12677">pdf</a>, <a href="https://arxiv.org/format/2502.12677">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Spiking Vision Transformer with Saccadic Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Malu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dehao Zhang</a>, <a href="/search/cs?searchtype=author&query=Belatreche%2C+A">Ammar Belatreche</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yichen Xiao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yu Liang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Yimeng Shan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qian Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+E">Enqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12677v1-abstract-short" style="display: inline;"> The combination of Spiking Neural Networks (SNNs) and Vision Transformers (ViTs) holds potential for achieving both energy efficiency and high performance, particularly suitable for edge vision applications. However, a significant performance gap still exists between SNN-based ViTs and their ANN counterparts. Here, we first analyze why SNN-based ViTs suffer from limited performance and identify a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12677v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12677v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12677v1-abstract-full" style="display: none;"> The combination of Spiking Neural Networks (SNNs) and Vision Transformers (ViTs) holds potential for achieving both energy efficiency and high performance, particularly suitable for edge vision applications. However, a significant performance gap still exists between SNN-based ViTs and their ANN counterparts. Here, we first analyze why SNN-based ViTs suffer from limited performance and identify a mismatch between the vanilla self-attention mechanism and spatio-temporal spike trains. This mismatch results in degraded spatial relevance and limited temporal interactions. To address these issues, we draw inspiration from biological saccadic attention mechanisms and introduce an innovative Saccadic Spike Self-Attention (SSSA) method. Specifically, in the spatial domain, SSSA employs a novel spike distribution-based method to effectively assess the relevance between Query and Key pairs in SNN-based ViTs. Temporally, SSSA employs a saccadic interaction module that dynamically focuses on selected visual areas at each timestep and significantly enhances whole scene understanding through temporal interactions. 
Building on the SSSA mechanism, we develop an SNN-based Vision Transformer (SNN-ViT). Extensive experiments across various visual tasks demonstrate that SNN-ViT achieves state-of-the-art performance with linear computational complexity. The effectiveness and efficiency of the SNN-ViT highlight its potential for power-critical edge vision applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12677v1-abstract-full').style.display = 'none'; document.getElementById('2502.12677v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12478">arXiv:2502.12478</a> <span> [<a href="https://arxiv.org/pdf/2502.12478">pdf</a>, <a href="https://arxiv.org/format/2502.12478">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MSE-Adapter: A Lightweight Plugin Endowing LLMs with the Capability to Perform Multimodal Sentiment Analysis and Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xunde Dong</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+Y">Yupeng Qiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12478v1-abstract-short" style="display: inline;"> Current Multimodal Sentiment Analysis (MSA) and Emotion Recognition in Conversations (ERC) methods based on pre-trained language models exhibit two primary limitations: 1) Once trained for MSA and ERC tasks, these pre-trained language models lose their original generalized capabilities. 2) They demand considerable computational resources. As the size of pre-trained language models continues to g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12478v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12478v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12478v1-abstract-full" style="display: none;"> Current Multimodal Sentiment Analysis (MSA) and Emotion Recognition in Conversations (ERC) methods based on pre-trained language models exhibit two primary limitations: 1) Once trained for MSA and ERC tasks, these pre-trained language models lose their original generalized capabilities. 2) They demand considerable computational resources. As the size of pre-trained language models continues to grow, training larger multimodal sentiment analysis models using previous approaches could result in unnecessary computational cost.
In response to this challenge, we propose \textbf{M}ultimodal \textbf{S}entiment Analysis and \textbf{E}motion Recognition \textbf{Adapter} (MSE-Adapter), a lightweight and adaptable plugin. This plugin enables a large language model (LLM) to carry out MSA or ERC tasks with minimal computational overhead (introducing only approximately 2.6M to 2.8M trainable parameters on top of the 6/7B models), while preserving the intrinsic capabilities of the LLM. In the MSE-Adapter, the Text-Guide-Mixer (TGM) module is introduced to establish explicit connections between non-textual and textual modalities through the Hadamard product. This allows non-textual modalities to better align with textual modalities at the feature level, promoting the generation of higher-quality pseudo tokens. Extensive experiments were conducted on four public English and Chinese datasets using consumer-grade GPUs and open-source LLMs (Qwen-1.8B, ChatGLM3-6B-base, and LLaMA2-7B) as the backbone. The results demonstrate the effectiveness of the proposed plugin. The code will be released on GitHub after a blind review. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12478v1-abstract-full').style.display = 'none'; document.getElementById('2502.12478v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12346">arXiv:2502.12346</a> <span> [<a href="https://arxiv.org/pdf/2502.12346">pdf</a>, <a href="https://arxiv.org/format/2502.12346">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> QuZO: Quantized Zeroth-Order Fine-Tuning for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiajun Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+K">Kai Zhen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyue Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yequan Zhao</a>, <a href="/search/cs?searchtype=author&query=Banijamali%2C+E">Ershad Banijamali</a>, <a href="/search/cs?searchtype=author&query=Mouchtaris%2C+A">Athanasios Mouchtaris</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+N">Ngai Wong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12346v1-abstract-short" style="display: inline;"> Language Models (LLMs) are often quantized to lower precision to reduce the memory cost and latency in inference. However, quantization often degrades model performance, thus fine-tuning is required for various downstream tasks.
Traditional fine-tuning methods such as stochastic gradient descent and Adam optimization require backpropagation, which is error-prone in low-precision settings. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12346v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12346v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12346v1-abstract-full" style="display: none;"> Language Models (LLMs) are often quantized to lower precision to reduce the memory cost and latency in inference. However, quantization often degrades model performance, thus fine-tuning is required for various downstream tasks. Traditional fine-tuning methods such as stochastic gradient descent and Adam optimization require backpropagation, which is error-prone in low-precision settings. To overcome these limitations, we propose the Quantized Zeroth-Order (QuZO) framework, specifically designed for fine-tuning LLMs through low-precision (e.g., 4- or 8-bit) forward passes. Our method avoids the error-prone low-precision straight-through estimator and utilizes optimized stochastic rounding to mitigate the increased bias. QuZO simplifies the training process, while achieving results comparable to first-order methods in ${\rm FP}8$ and superior accuracy in ${\rm INT}8$ and ${\rm INT}4$ training. Experiments demonstrate that QuZO's low-bit training achieves performance comparable to MeZO optimization on GLUE, Multi-Choice, and Generation tasks, while reducing memory cost by $2.94 \times$ in LLaMA2-7B fine-tuning compared to quantized first-order methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12346v1-abstract-full').style.display = 'none'; document.getElementById('2502.12346v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
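<p class="is-size-7">For intuition, the zeroth-order ingredient that QuZO builds on replaces backpropagation with two forward passes. A minimal MeZO-style two-point estimator sketch (the quantized forward pass is stubbed with a placeholder of ours; this is not the paper's code):</p>
<pre><code class="language-python">
# Sketch of a standard two-point SPSA/MeZO-style zeroth-order update,
# the ingredient QuZO builds on; quantized_loss stands in for a
# low-precision (e.g., INT4/INT8) forward pass.
import numpy as np

def quantized_loss(theta):
    # Placeholder for a low-precision forward pass returning the loss.
    return float(np.sum((theta - 1.0) ** 2))

def zo_step(theta, lr=0.05, eps=1e-3, rng=np.random.default_rng(0)):
    z = rng.standard_normal(theta.shape)          # shared random perturbation
    g = (quantized_loss(theta + eps * z)
         - quantized_loss(theta - eps * z)) / (2 * eps)
    return theta - lr * g * z                     # gradient-free update

theta = np.zeros(4)
for _ in range(200):
    theta = zo_step(theta)
print(np.round(theta, 2))  # moves toward the minimizer at 1.0
</code></pre>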
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12151">arXiv:2502.12151</a> <span> [<a href="https://arxiv.org/pdf/2502.12151">pdf</a>, <a href="https://arxiv.org/format/2502.12151">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> VoLUT: Efficient Volumetric streaming enhanced by LUT-based super-resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chendong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Anlan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+L">Lili Qiu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuqing Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xinyang Jiang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+F">Feng Qian</a>, <a href="/search/cs?searchtype=author&query=Banerjee%2C+S">Suman Banerjee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12151v1-abstract-short" style="display: inline;"> 3D volumetric video provides immersive experience and is gaining traction in digital media. Despite its rising popularity, the streaming of volumetric video content poses significant challenges due to the high data bandwidth requirement. A natural approach to mitigate the bandwidth issue is to reduce the volumetric video's data rate by downsampling the content prior to transmission. The video can… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12151v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12151v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12151v1-abstract-full" style="display: none;"> 3D volumetric video provides immersive experience and is gaining traction in digital media. Despite its rising popularity, the streaming of volumetric video content poses significant challenges due to the high data bandwidth requirement. A natural approach to mitigate the bandwidth issue is to reduce the volumetric video's data rate by downsampling the content prior to transmission. The video can then be upsampled at the receiver's end using a super-resolution (SR) algorithm to reconstruct the high-resolution details. While super-resolution techniques have been extensively explored and advanced for 2D video content, there is limited work on SR algorithms tailored for volumetric videos. To address this gap and the growing need for efficient volumetric video streaming, we have developed VoLUT with a new SR algorithm specifically designed for volumetric content. Our algorithm uniquely harnesses the power of lookup tables (LUTs) to facilitate the efficient and accurate upscaling of low-resolution volumetric data. The use of LUTs enables our algorithm to quickly reference precomputed high-resolution values, thereby significantly reducing the computational complexity and time required for upscaling. 
We further apply an adaptive bitrate (ABR) algorithm to dynamically determine the downsampling rate according to the network condition and stream the selected video rate to the receiver. Compared to related work, VoLUT is the first to enable high-quality 3D SR on commodity mobile devices at line-rate. Our evaluation shows VoLUT can reduce bandwidth usage by 70%, boost QoE by 36.7% for volumetric video streaming, and achieve 3D SR speed-up with no quality compromise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12151v1-abstract-full').style.display = 'none'; document.getElementById('2502.12151v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12007">arXiv:2502.12007</a> <span> [<a href="https://arxiv.org/pdf/2502.12007">pdf</a>, <a href="https://arxiv.org/ps/2502.12007">ps</a>, <a href="https://arxiv.org/format/2502.12007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Demographic Attributes Prediction from Speech Using WavLM Embeddings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuchen Yang</a>, <a href="/search/cs?searchtype=author&query=Thebaud%2C+T">Thomas Thebaud</a>, <a href="/search/cs?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12007v1-abstract-short" style="display: inline;"> This paper introduces a general classifier based on WavLM features, to infer demographic characteristics, such as age, gender, native language, education, and country, from speech. Demographic feature prediction plays a crucial role in applications like language learning, accessibility, and digital forensics, enabling more personalized and inclusive technologies. Leveraging pretrained models for e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12007v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12007v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12007v1-abstract-full" style="display: none;"> This paper introduces a general classifier based on WavLM features, to infer demographic characteristics, such as age, gender, native language, education, and country, from speech. Demographic feature prediction plays a crucial role in applications like language learning, accessibility, and digital forensics, enabling more personalized and inclusive technologies.
Leveraging pretrained models for embedding extraction, the proposed framework identifies key acoustic and linguistic features associated with demographic attributes, achieving a Mean Absolute Error (MAE) of 4.94 for age prediction and over 99.81% accuracy for gender classification across various datasets. Our system improves upon existing models by up to 30% relative in MAE and up to 10% relative in accuracy and F1 scores across tasks, leveraging a diverse range of datasets and large pretrained models to ensure robustness and generalizability. This study offers new insights into speaker diversity and provides a strong foundation for future research in speech-based demographic profiling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12007v1-abstract-full').style.display = 'none'; document.getElementById('2502.12007v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, accepted by The Conference on Information Sciences and Systems (CISS)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11946">arXiv:2502.11946</a> <span> [<a href="https://arxiv.org/pdf/2502.11946">pdf</a>, <a href="https://arxiv.org/format/2502.11946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+A">Ailin Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Boyong Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bruce Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+C">Chao Yan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chen Hu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chengli Feng</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Fei Tian</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+F">Feiyu Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingbei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingrui Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+R">Ruihang Miao</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Wang You</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuerui Yang</a>, <a
href="/search/cs?searchtype=author&query=Huang%2C+Y">Yechang Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jianjian Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Brian Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chengting Feng</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hanpeng Hu</a> , et al. (120 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11946v2-abstract-short" style="display: inline;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contribu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11946v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11946v2-abstract-full" style="display: none;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, it shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'none'; document.getElementById('2502.11946v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11775">arXiv:2502.11775</a> <span> [<a href="https://arxiv.org/pdf/2502.11775">pdf</a>, <a href="https://arxiv.org/format/2502.11775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> video-SALMONN-o1: Reasoning-enhanced Audio-visual Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yudong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+J">Jimin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Changli Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yixuan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=MA%2C+Z">Zejun MA</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11775v1-abstract-short" style="display: inline;"> While recent advancements in reasoning optimization have significantly enhanced the capabilities of large language models (LLMs), existing efforts to improve reasoning have been limited to solving mathematical problems and focusing on visual graphical inputs, neglecting broader applications in general video understanding.This paper proposes video-SALMONN-o1, the first open-source reasoning-enhance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11775v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11775v1-abstract-full" style="display: none;"> While recent advancements in reasoning optimization have significantly enhanced the capabilities of large language models (LLMs), existing efforts to improve reasoning have been limited to solving mathematical problems and focusing on visual graphical inputs, neglecting broader applications in general video understanding.This paper proposes video-SALMONN-o1, the first open-source reasoning-enhanced audio-visual LLM designed for general video understanding tasks. To enhance its reasoning abilities, we develop a reasoning-intensive dataset featuring challenging audio-visual questions with step-by-step solutions. We also propose process direct preference optimization (pDPO), which leverages contrastive step selection to achieve efficient step-level reward modelling tailored for multimodal inputs. 
Additionally, we introduce RivaBench, the first reasoning-intensive video understanding benchmark, featuring over 4,000 high-quality, expert-curated question-answer pairs across scenarios such as standup comedy, academic presentations, and synthetic video detection. video-SALMONN-o1 achieves 3-8% accuracy improvements over the LLaVA-OneVision baseline across different video reasoning benchmarks. Besides, pDPO achieves 6-8% improvements compared to the supervised fine-tuning model on RivaBench. Enhanced reasoning equips video-SALMONN-o1 with zero-shot synthetic video detection capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11775v1-abstract-full').style.display = 'none'; document.getElementById('2502.11775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11513">arXiv:2502.11513</a> <span> [<a href="https://arxiv.org/pdf/2502.11513">pdf</a>, <a href="https://arxiv.org/format/2502.11513">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MaZO: Masked Zeroth-Order Optimization for Multi-Task Fine-Tuning of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+K">Kai Zhen</a>, <a href="/search/cs?searchtype=author&query=Susanj%2C+N">Nathan Susanj</a>, <a href="/search/cs?searchtype=author&query=Mouchtaris%2C+A">Athanasios Mouchtaris</a>, <a href="/search/cs?searchtype=author&query=Kunzmann%2C+S">Siegfried Kunzmann</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11513v1-abstract-short" style="display: inline;"> Large language models have demonstrated exceptional capabilities across diverse tasks, but their fine-tuning demands significant memory, posing challenges for resource-constrained environments. Zeroth-order (ZO) optimization provides a memory-efficient alternative by eliminating the need for backpropagation. However, ZO optimization suffers from high gradient variance, and prior research has large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11513v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11513v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11513v1-abstract-full" style="display: none;"> Large language models have demonstrated exceptional capabilities across diverse tasks, but their fine-tuning demands significant memory, posing challenges for resource-constrained environments.
Zeroth-order (ZO) optimization provides a memory-efficient alternative by eliminating the need for backpropagation. However, ZO optimization suffers from high gradient variance, and prior research has largely focused on single-task learning, leaving its application to multi-task learning unexplored. Multi-task learning is crucial for leveraging shared knowledge across tasks to improve generalization, yet it introduces unique challenges under ZO settings, such as amplified gradient variance and collinearity. In this paper, we present MaZO, the first framework specifically designed for multi-task LLM fine-tuning under ZO optimization. MaZO tackles these challenges at the parameter level through two key innovations: a weight importance metric to identify critical parameters and a multi-task weight update mask to selectively update these parameters, reducing the dimensionality of the parameter space and mitigating task conflicts. Experiments demonstrate that MaZO achieves state-of-the-art performance, surpassing even multi-task learning methods designed for first-order optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11513v1-abstract-full').style.display = 'none'; document.getElementById('2502.11513v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11487">arXiv:2502.11487</a> <span> [<a href="https://arxiv.org/pdf/2502.11487">pdf</a>, <a href="https://arxiv.org/format/2502.11487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Non-Binary LDPC Arithmetic Error Correction For Processing-in-Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+D">Daijing Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yihang Zhu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+A">Anjunyi Fan</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yaoyu Tao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuchao Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+B">Bonan Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11487v1-abstract-short" style="display: inline;"> Processing-in-memory (PIM) based on emerging devices such as memristors is more vulnerable to noise than traditional memories, due to the physical non-idealities and complex operations in analog domains. To ensure high reliability, efficient error-correcting code (ECC) is highly desired. 
However, state-of-the-art ECC schemes for PIM suffer drawbacks including dataflow interruptions, low code rates… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11487v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11487v1-abstract-full" style="display: none;"> Processing-in-memory (PIM) based on emerging devices such as memristors is more vulnerable to noise than traditional memories, due to the physical non-idealities and complex operations in analog domains. To ensure high reliability, efficient error-correcting code (ECC) is highly desired. However, state-of-the-art ECC schemes for PIM suffer drawbacks including dataflow interruptions, low code rates, and limited error correction patterns. In this work, we propose non-binary low-density parity-check (NB-LDPC) error correction running over the Galois field. Such NB-LDPC scheme with a long word length of 1024 bits can correct up to 8-bit errors with a code rate over 88%. Nonbinary GF operations can support both memory mode and PIM mode even with multi-level memory cells. We fabricate a 40nm prototype PIM chip equipped with our proposed NB-LDPC scheme for validation purposes. Experiments show that PIM with NB-LDPC error correction demonstrates up to 59.65 times bit error rate (BER) improvement over the original PIM without such error correction. The test chip delivers 2.978 times power efficiency enhancement over prior works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11487v1-abstract-full').style.display = 'none'; document.getElementById('2502.11487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
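<p class="is-size-7">For readers unfamiliar with non-binary codes, the key ingredient is that parity checks are evaluated over a Galois field rather than over bits. A toy illustration of a syndrome check over GF(2^4) (this example is ours, not the paper's code or parameters):</p>
<pre><code class="language-python">
# Toy illustration of the non-binary ingredient: parity checks evaluated
# over GF(2^4), where addition is XOR and multiplication is carry-less
# multiplication reduced by x^4 + x + 1.
from functools import reduce
from operator import xor

def gf16_mul(a, b):
    """Multiply two GF(2^4) elements, reducing by x^4 + x + 1."""
    r = 0
    for _ in range(4):
        if b & 1:
            r ^= a
        b //= 2          # consume the next bit of b
        a *= 2           # multiply a by x
        if a & 16:       # degree-4 overflow: reduce modulo x^4 + x + 1
            a ^= 0b10011
    return r

def syndrome(H, c):
    """Syndrome of codeword c under parity-check matrix H over GF(2^4);
    an all-zero syndrome means c satisfies every check."""
    return [reduce(xor, (gf16_mul(hij, cj) for hij, cj in zip(row, c)), 0)
            for row in H]

H = [[1, 1]]                 # toy single parity check over GF(16)
print(syndrome(H, [7, 7]))   # [0] -> consistent with the check
print(syndrome(H, [7, 2]))   # [5] -> nonzero syndrome flags a symbol error
</code></pre>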
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11480">arXiv:2502.11480</a> <span> [<a href="https://arxiv.org/pdf/2502.11480">pdf</a>, <a href="https://arxiv.org/format/2502.11480">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Offline Model-Based RL via Active Model Selection: A Bayesian Optimization Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yu-Wei Yang</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+Y">Yun-Ming Chan</a>, <a href="/search/cs?searchtype=author&query=Hung%2C+W">Wei Hung</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xi Liu</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+P">Ping-Chun Hsieh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11480v1-abstract-short" style="display: inline;"> Offline model-based reinforcement learning (MBRL) serves as a competitive framework that can learn well-performing policies solely from pre-collected data with the help of learned dynamics models. To fully unleash the power of offline MBRL, model selection plays a pivotal role in determining the dynamics model utilized for downstream policy learning. However, offline MBRL conventionally relies on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11480v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11480v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11480v1-abstract-full" style="display: none;"> Offline model-based reinforcement learning (MBRL) serves as a competitive framework that can learn well-performing policies solely from pre-collected data with the help of learned dynamics models. To fully unleash the power of offline MBRL, model selection plays a pivotal role in determining the dynamics model utilized for downstream policy learning. However, offline MBRL conventionally relies on validation or off-policy evaluation, which are rather inaccurate due to the inherent distribution shift in offline RL. To tackle this, we propose BOMS, an active model selection framework that enhances model selection in offline MBRL with only a small online interaction budget, through the lens of Bayesian optimization (BO). Specifically, we recast model selection as BO and enable probabilistic inference in BOMS by proposing a novel model-induced kernel, which is theoretically grounded and computationally efficient. Through extensive experiments, we show that BOMS improves over the baseline methods with a small amount of online interaction comparable to only $1\%$-$2.5\%$ of offline training data on various RL tasks. 
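<p class="is-size-7">The recipe above, recasting model selection as Bayesian optimization with a kernel defined over candidate models, can be sketched as follows. Everything here is our stand-in: the kernel built from prediction disagreement on probe states and the UCB acquisition are illustrative, not the paper's exact model-induced kernel:</p>
<pre><code class="language-python">
# Hedged sketch of active model selection via Bayesian optimization:
# each candidate dynamics model is an arm; similarity between models is
# measured by disagreement of their predictions on shared probe states
# (our stand-in for a model-induced kernel); the next model to roll out
# is chosen by a GP upper-confidence bound over observed returns.
import numpy as np

def model_kernel(preds, lengthscale=1.0):
    """K[i, j] from mean squared disagreement of model predictions."""
    n = len(preds)
    K = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            d2 = float(np.mean((preds[i] - preds[j]) ** 2))
            K[i, j] = np.exp(-d2 / (2.0 * lengthscale ** 2))
    return K

def ucb_next(K, tried, returns, beta=2.0, noise=1e-3):
    """GP posterior over per-model return; pick the untried UCB argmax."""
    t = list(tried)
    K_tt = K[np.ix_(t, t)] + noise * np.eye(len(t))
    alpha = np.linalg.solve(K_tt, np.array(returns))
    scores = np.full(K.shape[0], -np.inf)
    for m in range(K.shape[0]):
        if m in tried:
            continue
        k = K[m, t]
        mu = float(k @ alpha)
        var = float(K[m, m]) - float(k @ np.linalg.solve(K_tt, k))
        scores[m] = mu + beta * np.sqrt(max(var, 0.0))
    return int(np.argmax(scores))

rng = np.random.default_rng(0)
preds = [rng.normal(size=32) for _ in range(5)]   # probe-state predictions
K = model_kernel(preds)
tried, returns = [0, 1], [0.4, 0.7]               # small online budget so far
print(ucb_next(K, tried, returns))                # next model to evaluate
</code></pre>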
arXiv:2502.11420 (https://arxiv.org/abs/2502.11420) [pdf, other]
Subjects: cs.LG (Machine Learning)
Training-Free Guidance Beyond Differentiability: Scalable Path Steering with Tree Search in Diffusion and Flow Models
Authors: Yingqing Guo, Yukang Yang, Hui Yuan, Mengdi Wang
Abstract: Training-free guidance enables controlled generation in diffusion and flow models, but most existing methods assume differentiable objectives and rely on gradients. This work focuses on training-free guidance addressing challenges from non-differentiable objectives and discrete data distributions. We propose an algorithmic framework, TreeG: Tree Search-Based Path Steering Guidance, applicable to both continuous and discrete settings in diffusion and flow models. TreeG offers a unified perspective on training-free guidance: proposing candidates for the next step, evaluating candidates, and selecting the best to move forward, enhanced by a tree search mechanism over active paths or parallelizing exploration. We comprehensively investigate the design space of TreeG over the candidate proposal module and the evaluation function, instantiating TreeG into three novel algorithms. Our experiments show that TreeG consistently outperforms the top guidance baselines in symbolic music generation, small molecule generation, and enhancer DNA design, all of which involve non-differentiable challenges. Additionally, we identify an inference-time scaling law showing TreeG's scalability in inference-time computation.
Submitted 16 February, 2025; originally announced February 2025.
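The propose/evaluate/select loop the abstract describes can be sketched generically. `denoise_step`, `objective`, and the beam/branch sizes below are hypothetical stand-ins; the paper instantiates this pattern in three specific ways that this sketch does not reproduce.

```python
# Sketch of tree-search path steering: propose candidates at each step,
# evaluate them with a (possibly non-differentiable) objective, keep the best.
import numpy as np

rng = np.random.default_rng(0)

def denoise_step(x, t):
    # placeholder for one reverse diffusion / flow step with injected noise
    return x * 0.9 + rng.normal(scale=0.1, size=x.shape)

def objective(x):
    # non-differentiable target: only function evaluations are needed
    return -np.abs(x.sum())

def treeg_sample(x0, steps=10, branch=4, beam=2):
    paths = [x0.copy() for _ in range(beam)]          # active paths
    for t in range(steps, 0, -1):
        candidates = [denoise_step(p, t) for p in paths for _ in range(branch)]
        candidates.sort(key=objective, reverse=True)  # evaluate all candidates
        paths = candidates[:beam]                     # select the best to keep
    return paths[0]

print(objective(treeg_sample(rng.normal(size=8))))
```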
arXiv:2502.11407 (https://arxiv.org/abs/2502.11407) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing)
Gensor: A Graph-based Construction Tensor Compilation Method for Deep Learning
Authors: Hangda Liu, Boyu Diao, Yu Yang, Wenxin Chen, Xiaohui Peng, Yongjun Xu
Abstract: High-performance deep learning depends on efficient tensor programs. In recent years, automatic tensor program optimization, also known as tensor compilation, has emerged as the primary approach to generating efficient tensor programs. However, how to generate kernels with higher performance in a shorter time is still the key challenge. In this paper, we present Gensor, a graph-based construction tensor compilation method for deep learning, to further improve the performance of construction tensor compilation. Unlike existing tree-based methods, Gensor abstracts construction space into a graph structure. Gensor then explores the construction space with Markov analysis. Gensor takes tensor programs as states and models scheduling primitives as transition actions between these states. Therefore, the process of tensor program construction optimization is abstracted as a graph traversal process. This approach expands the optimization space, improving operator performance while ensuring rapid optimization. Extensive experiments with typical operators demonstrate that Gensor significantly outperforms the state-of-the-art methods on GPUs for both cloud servers and edge devices. As a result, Gensor can generate operator kernels in seconds, with performance increasing by 18% on average, reaching a maximum of 30%. It also achieves high speedup for end-to-end models like ResNet-50 and GPT-2, with an average acceleration of 20%.
Submitted 16 February, 2025; originally announced February 2025.
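The states-and-transitions view can be sketched as a random walk on a construction graph. The primitive list, `apply`, and the toy `perf` estimate below are stand-ins, not Gensor's actual scheduling primitives or cost model.

```python
# Sketch of construction as a Markov walk: states are (hypothetical) tensor
# program configurations, edges are scheduling primitives.
import random

PRIMITIVES = ["tile", "bind", "unroll", "vectorize"]

def apply(state, prim):
    return state + (prim,)        # here a state is just its primitive history

def perf(state):
    # toy performance estimate; a real system measures or predicts runtime
    random.seed(hash(state) % (2**32))
    return random.random()

def markov_walk(steps=6):
    state = ()
    for _ in range(steps):
        nxt = [apply(state, p) for p in PRIMITIVES]
        scores = [perf(s) for s in nxt]
        total = sum(scores)
        # transition probability proportional to estimated gain; Markov in the
        # sense that it depends only on the current state, not the path to it
        state = random.choices(nxt, weights=[s / total for s in scores])[0]
    return state, perf(state)

print(markov_walk())
```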
arXiv:2502.11287 (https://arxiv.org/abs/2502.11287) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
MC-BEVRO: Multi-Camera Bird Eye View Road Occupancy Detection for Traffic Monitoring
Authors: Arpitsinh Vaghela, Duo Lu, Aayush Atul Verma, Bharatesh Chakravarthi, Hua Wei, Yezhou Yang
Abstract: Single camera 3D perception for traffic monitoring faces significant challenges due to occlusion and limited field of view.
Moreover, fusing information from multiple cameras at the image feature level is difficult because of different view angles. Further, the necessity for practical implementation and compatibility with existing traffic infrastructure compounds these challenges. To address these issues, this paper introduces a novel Bird's-Eye-View road occupancy detection framework that leverages multiple roadside cameras to overcome the aforementioned limitations. To facilitate the framework's development and evaluation, a synthetic dataset featuring diverse scenes and varying camera configurations is generated using the CARLA simulator. A late fusion and three early fusion methods were implemented within the proposed framework, with performance further enhanced by integrating backgrounds. Extensive evaluations were conducted to analyze the impact of multi-camera inputs and varying BEV occupancy map sizes on model performance. Additionally, a real-world data collection pipeline was developed to assess the model's ability to generalize to real-world environments. The sim-to-real capabilities of the model were evaluated using zero-shot and few-shot fine-tuning, demonstrating its potential for practical application. This research aims to advance perception systems in traffic monitoring, contributing to improved traffic management, operational efficiency, and road safety.
Submitted 16 February, 2025; originally announced February 2025.
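A minimal late-fusion sketch, assuming each camera produces its own BEV occupancy grid and fusion happens at the output level; the grid size, camera count, and max rule are illustrative, not the paper's configuration.

```python
# Late fusion of per-camera BEV occupancy grids (illustrative assumptions).
import numpy as np

def fuse_bev(per_camera_grids):
    # per_camera_grids: (num_cams, H, W) occupancy probabilities in [0, 1].
    # Late fusion: a cell is occupied if any camera is confident about it.
    return np.max(per_camera_grids, axis=0)

rng = np.random.default_rng(0)
grids = rng.uniform(size=(4, 64, 64))   # 4 roadside cameras, 64x64 BEV grid
fused = fuse_bev(grids)
occupancy = fused > 0.5                 # threshold into a binary BEV map
print(fused.shape, occupancy.mean())
```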
arXiv:2502.11128 (https://arxiv.org/abs/2502.11128) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
FELLE: Autoregressive Speech Synthesis with Token-Wise Coarse-to-Fine Flow Matching
Authors: Hui Wang, Shujie Liu, Lingwei Meng, Jinyu Li, Yifan Yang, Shiwan Zhao, Haiyang Sun, Yanqing Liu, Haoqin Sun, Jiaming Zhou, Yan Lu, Yong Qin
Abstract: To advance continuous-valued token modeling and temporal-coherence enforcement, we propose FELLE, an autoregressive model that integrates language modeling with token-wise flow matching. By leveraging the autoregressive nature of language models and the generative efficacy of flow matching, FELLE effectively predicts continuous-valued tokens (mel-spectrograms). For each continuous-valued token, FELLE modifies the general prior distribution in flow matching by incorporating information from the previous step, improving coherence and stability. Furthermore, to enhance synthesis quality, FELLE introduces a coarse-to-fine flow-matching mechanism, generating continuous-valued tokens hierarchically, conditioned on the language model's output. Experimental results demonstrate the potential of incorporating flow-matching techniques in autoregressive mel-spectrogram modeling, leading to significant improvements in TTS generation quality, as shown in https://aka.ms/felle.
Submitted 16 February, 2025; originally announced February 2025.
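The informed-prior idea (start each token's flow near the previous mel frame instead of a generic Gaussian) can be sketched with Euler integration of a stand-in velocity field; the shapes, step count, and `velocity` are assumptions, not FELLE's architecture.

```python
# Token-wise flow matching with a prior centered on the previous frame
# (illustrative sketch; `velocity` stands in for the learned vector field).
import numpy as np

rng = np.random.default_rng(0)
N_MELS, STEPS = 80, 8

def velocity(x, tau, cond):
    # placeholder for the learned field v(x, tau | LM condition); a real model
    # is a network conditioned on the language model's output
    return cond - x

def flow_match_token(prev_frame, cond):
    x = prev_frame + 0.1 * rng.normal(size=N_MELS)  # prior near frame t-1
    for k in range(STEPS):                          # simple Euler integration
        x = x + (1.0 / STEPS) * velocity(x, k / STEPS, cond)
    return x

frames = [np.zeros(N_MELS)]
for t in range(5):                                  # autoregressive over tokens
    cond = rng.normal(size=N_MELS)                  # stand-in LM hidden state
    frames.append(flow_match_token(frames[-1], cond))
print(np.stack(frames[1:]).shape)                   # (5, 80) mel-spectrogram
```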
arXiv:2502.11094 (https://arxiv.org/abs/2502.11094) [pdf, other]
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence)
SyncSpeech: Low-Latency and Efficient Dual-Stream Text-to-Speech based on Temporal Masked Transformer
Authors: Zhengyan Sheng, Zhihao Du, Shiliang Zhang, Zhijie Yan, Yexin Yang, Zhenhua Ling
Abstract: This paper presents a dual-stream text-to-speech (TTS) model, SyncSpeech, capable of receiving streaming text input from upstream models while simultaneously generating streaming speech, facilitating seamless interaction with large language models. SyncSpeech has the following advantages: low latency, as it begins generating streaming speech upon receiving the second text token; and high efficiency, as it decodes all speech tokens corresponding to each arrived text token in one step. To achieve this, we propose a temporal masked transformer as the backbone of SyncSpeech, combined with token-level duration prediction to predict speech tokens and the duration for the next step. Additionally, we design a two-stage training strategy to improve training efficiency and the quality of generated speech. We evaluated SyncSpeech on both English and Mandarin datasets. Compared to recent dual-stream TTS models, SyncSpeech significantly reduces the first packet delay of speech tokens and accelerates the real-time factor. Moreover, with the same data scale, SyncSpeech achieves performance comparable to that of traditional autoregressive-based TTS models in terms of both speech quality and robustness. Speech samples are available at https://SyncSpeech.github.io/.
Submitted 16 February, 2025; originally announced February 2025.
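A sketch of the dual-stream schedule under stated assumptions: generation starts once the second text token arrives, and a token-level duration predictor decides how many speech tokens to decode for each text token in one step. `predict_duration` and `decode_speech` are toy stand-ins, not the temporal masked transformer itself.

```python
# Dual-stream TTS scheduling sketch (toy stand-ins for the real models).
def predict_duration(text_token):
    return 2 + len(text_token) % 3       # stand-in token-level duration model

def decode_speech(text_token, n):
    # stand-in for decoding all n speech tokens of one text token in one step
    return [f"{text_token}#{i}" for i in range(n)]

def dual_stream_tts(text_stream):
    speech, pending = [], []
    for tok in text_stream:
        pending.append(tok)
        if len(pending) >= 2:            # low latency: start on the 2nd token
            cur = pending.pop(0)
            speech.extend(decode_speech(cur, predict_duration(cur)))
    for cur in pending:                  # flush the tail once the stream ends
        speech.extend(decode_speech(cur, predict_duration(cur)))
    return speech

print(dual_stream_tts(["hel", "lo", "wor", "ld"]))
```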
arXiv:2502.11023 (https://arxiv.org/abs/2502.11023) [pdf, other]
Subjects: eess.SP (Signal Processing); cs.LG (Machine Learning)
DT4ECG: A Dual-Task Learning Framework for ECG-Based Human Identity Recognition and Human Activity Detection
Authors: Siyu You, Boyuan Gu, Yanhui Yang, Shiyu Yu, Shisheng Guo
Abstract: This article introduces DT4ECG, an innovative dual-task learning framework for electrocardiogram (ECG)-based human identity recognition and activity detection. The framework employs a robust one-dimensional convolutional neural network (1D-CNN) backbone integrated with residual blocks to extract discriminative ECG features. To enhance feature representation, we propose a novel Sequence Channel Attention (SCA) mechanism, which combines channel-wise and sequential context attention to prioritize informative features across both temporal and channel dimensions. Furthermore, to address gradient imbalance in multi-task learning, we integrate GradNorm, a technique that dynamically adjusts loss weights based on gradient magnitudes, ensuring balanced training across tasks. Experimental results demonstrate the superior performance of our model, achieving accuracy rates of 99.12% in ID classification and 90.11% in activity classification. These findings underscore the potential of the DT4ECG framework in enhancing security and user experience across various applications such as fitness monitoring and personalized healthcare, thereby presenting a transformative approach to integrating ECG-based biometrics in everyday technologies.
Submitted 16 February, 2025; originally announced February 2025.
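GradNorm-style gradient balancing can be sketched in simplified form: tasks that are training slowly relative to their loss ratio get larger weights so per-task gradient norms move toward a common target. This is a schematic of the idea, not the exact published update rule.

```python
# Simplified GradNorm-style loss reweighting (sketch of the idea only).
import numpy as np

def gradnorm_weights(grad_norms, losses, init_losses, alpha=1.5):
    grad_norms, losses = np.asarray(grad_norms), np.asarray(losses)
    mean_norm = grad_norms.mean()
    inv_rate = losses / np.asarray(init_losses)   # remaining-loss ratio per task
    inv_rate = inv_rate / inv_rate.mean()         # relative training speed
    target = mean_norm * inv_rate ** alpha        # desired gradient norm
    w = target / (grad_norms + 1e-12)             # push norms toward the target
    return w * len(w) / w.sum()                   # renormalize: sum = n_tasks

# e.g. ID-classification gradients dominating activity-classification gradients
# get downweighted, and the slower task gets upweighted:
print(gradnorm_weights(grad_norms=[5.0, 1.0], losses=[0.4, 0.9],
                       init_losses=[1.0, 1.0]))
```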
arXiv:2502.10990 (https://arxiv.org/abs/2502.10990) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.IR (Information Retrieval)
FinMTEB: Finance Massive Text Embedding Benchmark
Authors: Yixuan Tang, Yi Yang
Abstract: Embedding models play a crucial role in representing and retrieving information across various NLP applications. Recent advances in large language models (LLMs) have further enhanced the performance of embedding models. While these models are often benchmarked on general-purpose datasets, real-world applications demand domain-specific evaluation. In this work, we introduce the Finance Massive Text Embedding Benchmark (FinMTEB), a specialized counterpart to MTEB designed for the financial domain. FinMTEB comprises 64 financial domain-specific embedding datasets across 7 tasks that cover diverse textual types in both Chinese and English, such as financial news articles, corporate annual reports, ESG reports, regulatory filings, and earnings call transcripts. We also develop a finance-adapted model, FinPersona-E5, using a persona-based data synthetic method to cover diverse financial embedding tasks for training. Through extensive evaluation of 15 embedding models, including FinPersona-E5, we show three key findings: (1) performance on general-purpose benchmarks shows limited correlation with financial domain tasks; (2) domain-adapted models consistently outperform their general-purpose counterparts; and (3) surprisingly, a simple Bag-of-Words (BoW) approach outperforms sophisticated dense embeddings in financial Semantic Textual Similarity (STS) tasks, underscoring current limitations in dense embedding techniques. Our work establishes a robust evaluation framework for financial NLP applications and provides crucial insights for developing domain-specific embedding models.
Submitted 15 February, 2025; originally announced February 2025.
Comments: https://github.com/yixuantt/FinMTEB
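Finding (3) is easy to illustrate: a Bag-of-Words cosine needs no trained model at all. The sentence pair below is made up; FinMTEB's actual STS data and evaluation protocol are more involved.

```python
# Bag-of-Words cosine similarity for an STS-style pair (illustrative only).
import math
from collections import Counter

def bow_cosine(a, b):
    va, vb = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(va[w] * vb[w] for w in va)
    na = math.sqrt(sum(c * c for c in va.values()))
    nb = math.sqrt(sum(c * c for c in vb.values()))
    return dot / (na * nb)

s1 = "quarterly revenue rose 12% on strong earnings"
s2 = "revenue rose 12% in the quarterly earnings report"
print(round(bow_cosine(s1, s2), 3))  # lexical overlap drives the score here
```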
arXiv:2502.10696 (https://arxiv.org/abs/2502.10696) [pdf, other]
Subjects: cs.SE (Software Engineering)
Improving Retrieval-Augmented Deep Assertion Generation via Joint Training
Authors: Quanjun Zhang, Chunrong Fang, Yi Zheng, Ruixiang Qian, Shengcheng Yu, Yuan Zhao, Jianyi Zhou, Yun Yang, Tao Zheng, Zhenyu Chen
Abstract: Unit testing attempts to validate the correctness of basic units of the software
system under test and has a crucial role in software development and testing. Very recent work proposes a retrieve-and-edit approach to generate unit test oracles, i.e., assertions. Despite being promising, it is still far from perfect due to some limitations, such as splitting assertion retrieval and generation into two separate components without benefiting each other. In this paper, we propose AG-RAG, a retrieval-augmented automated assertion generation approach that leverages external codebases and joint training to address various technical limitations of prior work. Inspired by the plastic surgery hypothesis, AG-RAG attempts to combine relevant unit tests and advanced pre-trained language models (PLMs) with retrieval-augmented fine-tuning. AG-RAG builds a dense retriever to search for relevant test-assert pairs (TAPs) with semantic matching and a retrieval-augmented generator to synthesize accurate assertions with the focal-test and retrieved TAPs as input. Besides, AG-RAG leverages a code-aware language model CodeT5 as the cornerstone to facilitate both assertion retrieval and generation tasks. Furthermore, the retriever is optimized in conjunction with the generator as a whole pipeline with a joint training strategy. This unified design fully adapts both components specifically for retrieving more useful TAPs, thereby generating accurate assertions. We extensively evaluate AG-RAG against six state-of-the-art AG approaches on two benchmarks and three metrics. Experimental results show that AG-RAG significantly outperforms previous AG approaches on all benchmarks and metrics, e.g., improving the most recent baseline EditAS by 20.82% and 26.98% in terms of accuracy. AG-RAG also correctly generates 1739 and 2866 unique assertions that all baselines fail to generate, 3.45X and 9.20X more than EditAS.
Submitted 24 February, 2025; v1 submitted 15 February, 2025; originally announced February 2025.
Comments: Accepted to IEEE Transactions on Software Engineering (TSE 2025)
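The retrieve-then-generate flow can be sketched as: embed the focal test, fetch the nearest test-assert pair, and condition generation on both. The hash-based embedding and the template "generator" below are toy stand-ins for the CodeT5-based retriever and generator.

```python
# Sketch of retrieval-augmented assertion generation (toy stand-ins).
import numpy as np

def embed(text, dim=64):
    # deterministic toy embedding; a real retriever uses a trained encoder
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.normal(size=dim)
    return v / np.linalg.norm(v)

taps = [  # corpus of (test prefix, assertion) pairs from an external codebase
    ("void testAdd() { int r = calc.add(1,2);", "assertEquals(3, r);"),
    ("void testPop() { stack.push(5); int r = stack.pop();", "assertEquals(5, r);"),
]
index = np.stack([embed(t) for t, _ in taps])

def generate_assertion(focal_test):
    scores = index @ embed(focal_test)      # dense retrieval by dot product
    test, assertion = taps[int(np.argmax(scores))]
    # a real generator conditions on both inputs; here we just echo them
    return f"// retrieved: {assertion}\n// focal: {focal_test}"

print(generate_assertion("void testSub() { int r = calc.sub(5,2);"))
```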
arXiv:2502.10639 (https://arxiv.org/abs/2502.10639) [pdf, other]
Subjects: cs.IR (Information Retrieval)
LSTM-based Selective Dense Text Retrieval Guided by Sparse Lexical Retrieval
Authors: Yingrui Yang, Parker Carlson, Yifan Qiao, Wentai Xie, Shanxiu He, Tao Yang
Abstract: This paper studies fast fusion of dense retrieval and sparse lexical retrieval, and proposes a cluster-based selective dense retrieval method called CluSD guided by sparse lexical retrieval. CluSD takes a lightweight cluster-based approach and exploits the overlap of sparse retrieval results and embedding clusters in a two-stage selection process with an LSTM model to quickly identify relevant clusters while incurring limited extra memory space overhead. CluSD triggers partial dense retrieval and performs cluster-based block disk I/O if needed. This paper evaluates CluSD and compares it with several baselines for searching in-memory and on-disk MS MARCO and BEIR datasets.
Submitted 14 February, 2025; originally announced February 2025.
Comments: This paper is accepted by ECIR'25
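The two-stage selection can be sketched with overlap counting in place of the LSTM scorer: clusters that overlap the sparse top-k survive, the best few are kept, and dense scoring runs only inside them. All sizes and the scoring rule are illustrative assumptions.

```python
# Sparse-guided selective dense retrieval sketch (overlap ratio replaces the
# LSTM scorer that CluSD actually uses).
import numpy as np

rng = np.random.default_rng(0)
docs = rng.normal(size=(1000, 16))               # toy dense document embeddings
cluster_of = rng.integers(0, 20, size=1000)      # 20 embedding clusters
sparse_topk = set(rng.choice(1000, size=50, replace=False))  # lexical results

def select_clusters(max_clusters=4):
    overlap = {c: 0 for c in range(20)}
    for d in sparse_topk:                        # stage 1: count overlap
        overlap[cluster_of[d]] += 1
    ranked = sorted(overlap, key=overlap.get, reverse=True)  # stage 2: rank
    return ranked[:max_clusters]

def selective_dense_search(query, k=10):
    keep = np.isin(cluster_of, select_clusters())  # partial dense retrieval
    ids = np.where(keep)[0]
    scores = docs[ids] @ query
    return ids[np.argsort(-scores)[:k]]

print(selective_dense_search(rng.normal(size=16)))
```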
arXiv:2502.10389 (https://arxiv.org/abs/2502.10389) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Region-Adaptive Sampling for Diffusion Transformers
Authors: Ziming Liu, Yifan Yang, Chengruidong Zhang, Yiqi Zhang, Lili Qiu, Yang You, Yuqing Yang
Abstract: Diffusion models (DMs) have become the leading choice for generative tasks across diverse domains. However, their reliance on multiple sequential forward passes significantly limits real-time performance. Previous acceleration methods have primarily focused on reducing the number of sampling steps or reusing intermediate results, failing to leverage variations across spatial regions within the image due to the constraints of convolutional U-Net structures. By harnessing the flexibility of Diffusion Transformers (DiTs) in handling a variable number of tokens, we introduce RAS, a novel, training-free sampling strategy that dynamically assigns different sampling ratios to regions within an image based on the focus of the DiT model. Our key observation is that during each sampling step, the model concentrates on semantically meaningful regions, and these areas of focus exhibit strong continuity across consecutive steps. Leveraging this insight, RAS updates only the regions currently in focus, while other regions are updated using cached noise from the previous step.
The model's focus is determined based on the output from the preceding step, capitalizing on the temporal consistency we observed. We evaluate RAS on Stable Diffusion 3 and Lumina-Next-T2I, achieving speedups up to 2.36x and 2.51x, respectively, with minimal degradation in generation quality. Additionally, a user study reveals that RAS delivers comparable quality under human evaluation while achieving a 1.6x speedup. Our approach makes a significant step towards more efficient diffusion transformers, enhancing their potential for real-time applications.
Submitted 14 February, 2025; originally announced February 2025.
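The caching scheme can be sketched at the token level: each step re-evaluates only the tokens the model focused on previously and reuses cached updates elsewhere. The focus criterion and the stand-in `model_update` below are assumptions, not RAS's actual implementation.

```python
# Region-adaptive updating sketch: fresh evaluation for focus tokens only.
import numpy as np

rng = np.random.default_rng(0)
tokens, dim, ratio = 256, 8, 0.25                # image latents as DiT tokens

def model_update(x):
    # stand-in for one DiT forward pass on the given tokens
    return -0.05 * x + 0.01 * rng.normal(size=x.shape)

x = rng.normal(size=(tokens, dim))
cached = model_update(x)                         # full pass on the first step
for step in range(10):
    # focus = tokens the model moved the most on the previous step
    focus = np.argsort(-np.linalg.norm(cached, axis=1))[: int(tokens * ratio)]
    fresh = model_update(x[focus])               # evaluate only focus regions
    cached[focus] = fresh                        # other rows keep cached noise
    x = x + cached
print(x.shape)
```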