Search | arXiv e-print repository

Showing 1–50 of 2,446 results for author: Chen, W
Searching in archive cs; sorted by announcement date (newest first); 50 results per page.

[1] arXiv:2502.15218 (https://arxiv.org/abs/2502.15218) [pdf, other]
Title: ESPnet-SpeechLM: An Open Speech Language Model Toolkit
Authors: Jinchuan Tian, Jiatong Shi, William Chen, Siddhant Arora, Yoshiki Masuyama, Takashi Maekaku, Yihan Wu, Junyi Peng, Shikhar Bharadwaj, Yiwen Zhao, Samuele Cornell, Yifan Peng, Xiang Yue, Chao-Han Huck Yang, Graham Neubig, Shinji Watanabe
Subjects: Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS)
Abstract: We present ESPnet-SpeechLM, an open toolkit designed to democratize the development of speech language models (SpeechLMs) and voice-driven agentic applications. The toolkit standardizes speech processing tasks by framing them as universal sequential modeling problems, encompassing a cohesive workflow of data preprocessing, pre-training, inference, and task evaluation. With ESPnet-SpeechLM, users can easily define task templates and configure key settings, enabling seamless and streamlined SpeechLM development. The toolkit ensures flexibility, efficiency, and scalability by offering highly configurable modules for every stage of the workflow. To illustrate its capabilities, we provide multiple use cases demonstrating how competitive SpeechLMs can be constructed with ESPnet-SpeechLM, including a 1.7B-parameter model pre-trained on both text and speech tasks, across diverse benchmarks. The toolkit and its recipes are fully transparent and reproducible at: https://github.com/espnet/espnet/tree/speechlm.
Submitted 21 February, 2025; originally announced February 2025.

[2] arXiv:2502.14356 (https://arxiv.org/abs/2502.14356) [pdf, other]
Title: Full-Step-DPO: Self-Supervised Preference Optimization with Step-wise Rewards for Mathematical Reasoning
Authors: Huimin Xu, Xin Mao, Feng-Lin Li, Xiaobao Wu, Wang Chen, Wei Zhang, Anh Tuan Luu
Subjects: Computation and Language (cs.CL)
Abstract: Direct Preference Optimization (DPO) often struggles with long-chain mathematical reasoning. Existing approaches, such as Step-DPO, typically improve this by focusing on the first erroneous step in the reasoning chain. However, they overlook all other steps and rely heavily on humans or GPT-4 to identify erroneous steps. To address these issues, we propose Full-Step-DPO, a novel DPO framework tailored for mathematical reasoning. Instead of optimizing only the first erroneous step, it leverages step-wise rewards from the entire reasoning chain. This is achieved by training a self-supervised process reward model, which automatically scores each step, providing rewards while avoiding reliance on external signals. Furthermore, we introduce a novel step-wise DPO loss that dynamically updates gradients based on these step-wise rewards, endowing language models with stronger reasoning capabilities. Extensive evaluations on both in-domain and out-of-domain mathematical reasoning benchmarks across various base language models demonstrate that Full-Step-DPO achieves superior performance compared to state-of-the-art baselines.
Submitted 20 February, 2025; originally announced February 2025.

[3] arXiv:2502.13539 (https://arxiv.org/abs/2502.13539) [pdf, other]
Title: Bursting Filter Bubble: Enhancing Serendipity Recommendations with Aligned Large Language Models
Authors: Yunjia Xi, Muyan Weng, Wen Chen, Chao Yi, Dian Chen, Gaoyang Guo, Mao Zhang, Jian Wu, Yuning Jiang, Qingwen Liu, Yong Yu, Weinan Zhang
Subjects: Information Retrieval (cs.IR)
Abstract: Recommender systems (RSs) often suffer from the feedback loop phenomenon, i.e., RSs are trained on data biased by their own recommendations. This leads to the filter bubble effect, which reinforces homogeneous content and reduces user satisfaction. To counter this, serendipity recommendations, which offer unexpected yet relevant items, have been proposed. Recently, large language models (LLMs) have shown potential in serendipity prediction due to their extensive world knowledge and reasoning capabilities. However, they still face challenges in aligning serendipity judgments with human assessments, handling long user behavior sequences, and meeting the latency requirements of industrial RSs. To address these issues, we propose SERAL (Serendipity Recommendations with Aligned Large Language Models), a framework comprising three stages: (1) Cognition Profile Generation, which compresses user behavior into multi-level profiles; (2) SerenGPT Alignment, which aligns serendipity judgments with human preferences using enriched training data; and (3) Nearline Adaptation, which integrates SerenGPT into industrial RS pipelines efficiently. Online experiments demonstrate that SERAL improves the exposure ratio (PVR), clicks, and transactions of serendipitous items by 5.7%, 29.56%, and 27.6%, respectively, enhancing user experience without much impact on overall revenue. It has now been fully deployed in the "Guess What You Like" section of the Taobao App homepage.
Submitted 19 February, 2025; originally announced February 2025.
Comments: 15 pages

[4] arXiv:2502.13530 (https://arxiv.org/abs/2502.13530) [pdf, other]
Title: Breaking the Clusters: Uniformity-Optimization for Text-Based Sequential Recommendation
Authors: Wuhan Chen, Zongwei Wang, Min Gao, Xin Xia, Feng Jiang, Junhao Wen
Subjects: Information Retrieval (cs.IR)
Abstract: Traditional sequential recommendation (SR) methods heavily rely on explicit item IDs to capture user preferences over time. This reliance introduces critical limitations in cold-start scenarios and domain transfer tasks, where unseen items and new contexts often lack established ID mappings. To overcome these limitations, recent studies have shifted towards leveraging text-only information for recommendation, thereby improving model generalization and adaptability across domains. Although promising, text-based SR faces unique difficulties: items' text descriptions often share semantic similarities that lead to clustered item representations, compromising their uniformity, a property essential for promoting diversity and enhancing generalization in recommendation systems. In this paper, we explore a novel framework to improve the uniformity of item representations in text-based SR. Our analysis reveals that items within a sequence exhibit marked semantic similarity, meaning they are closer in representation than items overall, and that this effect is more pronounced for less popular items, which form tighter clusters than their more popular counterparts. Based on these findings, we propose UniT, a framework that employs three pairwise item sampling strategies: Unified General Sampling Strategy, Sequence-Driven Sampling Strategy, and Popularity-Driven Sampling Strategy. Each strategy applies varying degrees of repulsion to selectively adjust the distances between item pairs, thereby refining representation uniformity while considering both sequence context and item popularity. Extensive experiments on multiple real-world datasets demonstrate that our proposed approach outperforms state-of-the-art models, validating the effectiveness of UniT in enhancing both representation uniformity and recommendation accuracy. The source code is available at https://github.com/ccwwhhh/Model-Rec.
Submitted 19 February, 2025; originally announced February 2025.
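
A minimal sketch of the pairwise-repulsion idea, assuming item text embeddings and a per-pair weight; the three named sampling strategies would differ only in how pairs are drawn and how strongly each pair is weighted.

```python
import torch
import torch.nn.functional as F

def repulsion_loss(emb_a, emb_b, strength):
    """Push sampled item pairs apart to improve representation uniformity.

    emb_a, emb_b: (batch, dim) embeddings of sampled item pairs.
    strength:     (batch,) per-pair repulsion weight, e.g. larger for pairs
                  drawn from the same sequence or for low-popularity items.
    """
    sim = F.cosine_similarity(emb_a, emb_b, dim=-1)  # in [-1, 1]
    return (strength * sim).mean()                   # minimizing lowers similarity
```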

[5] arXiv:2502.13467 (https://arxiv.org/abs/2502.13467) [pdf, ps, other]
Title: Continuous K-Max Bandits
Authors: Yu Chen, Siwei Wang, Longbo Huang, Wei Chen
Subjects: Machine Learning (cs.LG)
Abstract: We study the $K$-Max combinatorial multi-armed bandits problem with continuous outcome distributions and weak value-index feedback: each base arm has an unknown continuous outcome distribution, and in each round the learning agent selects $K$ arms, obtains the maximum value sampled from these $K$ arms as reward, and observes this reward together with the corresponding arm index as feedback. This setting captures critical applications in recommendation systems, distributed computing, server scheduling, etc. Continuous $K$-Max bandits introduce unique challenges, including discretization error from continuous-to-discrete conversion, non-deterministic tie-breaking under limited feedback, and biased estimation due to partial observability. Our key contribution is the computationally efficient algorithm DCK-UCB, which combines adaptive discretization with bias-corrected confidence bounds to tackle these challenges. For general continuous distributions, we prove that DCK-UCB achieves a $\widetilde{\mathcal{O}}(T^{3/4})$ regret upper bound, establishing the first sublinear regret guarantee for this setting. Furthermore, we identify an important special case with exponential distributions under full-bandit feedback. In this case, our proposed algorithm MLE-Exp attains a $\widetilde{\mathcal{O}}(\sqrt{T})$ regret upper bound through maximum log-likelihood estimation, achieving near-minimax optimality.
Submitted 19 February, 2025; originally announced February 2025.

[6] arXiv:2502.12671 (https://arxiv.org/abs/2502.12671) [pdf, other]
Title: Baichuan-M1: Pushing the Medical Capability of Large Language Models
Authors: Bingning Wang, Haizhou Zhao, Huozhi Zhou, Liang Song, Mingyu Xu, Wei Cheng, Xiangrong Zeng, Yupeng Zhang, Yuqi Huo, Zecheng Wang, Zhengyun Zhao, Da Pan, Fan Yang, Fei Kou, Fei Li, Fuzhong Chen, Guosheng Dong, Han Liu, Hongda Zhang, Jin He, Jinjie Yang, Kangxi Wu, Kegeng Wu, Lei Su, Linlin Niu, et al. (18 additional authors not shown)
Subjects: Computation and Language (cs.CL)
Abstract: The current generation of large language models (LLMs) is typically designed for broad, general-purpose applications, while domain-specific LLMs, especially in vertical fields like medicine, remain relatively scarce. In particular, developing highly efficient and practical LLMs for the medical domain is challenging due to the complexity of medical knowledge and the limited availability of high-quality data. To bridge this gap, we introduce Baichuan-M1, a series of large language models specifically optimized for medical applications. Unlike traditional approaches that simply continue pretraining on existing models or apply post-training to a general base model, Baichuan-M1 is trained from scratch with a dedicated focus on enhancing medical capabilities. Our model is trained on 20 trillion tokens and incorporates a range of effective training methods that strike a balance between general capabilities and medical expertise. As a result, Baichuan-M1 not only performs strongly across general domains such as mathematics and coding but also excels in specialized medical fields. We have open-sourced Baichuan-M1-14B, a mini version of our model, which can be accessed through the following links.
Submitted 18 February, 2025; originally announced February 2025.
Comments: 33 pages, technical report

[7] arXiv:2502.12658 (https://arxiv.org/abs/2502.12658) [pdf, other]
Title: R.R.: Unveiling LLM Training Privacy through Recollection and Ranking
Authors: Wenlong Meng, Zhenyuan Guo, Lenan Wu, Chen Gong, Wenyan Liu, Weixian Li, Chengkun Wei, Wenzhi Chen
Subjects: Computation and Language (cs.CL)
Abstract: Large Language Models (LLMs) pose significant privacy risks, potentially leaking training data due to implicit memorization. Existing privacy attacks primarily focus on membership inference attacks (MIAs) or data extraction attacks, but reconstructing specific personally identifiable information (PII) in an LLM's training data remains challenging. In this paper, we propose R.R. (Recollect and Rank), a novel two-step privacy-stealing attack that enables attackers to reconstruct PII entities from scrubbed training data in which the PII entities have been masked. In the first stage, we introduce a prompt paradigm named recollection, which instructs the LLM to repeat masked text while filling in the masks. We can then use PII identifiers to extract recollected PII candidates. In the second stage, we design a new criterion to score each PII candidate and rank them. Motivated by membership inference, we leverage a reference model as a calibration for our criterion. Experiments across three popular PII datasets demonstrate that R.R. recovers PII more accurately than baselines. These results highlight the vulnerability of LLMs to PII leakage even when training data has been scrubbed. We release the replication package of R.R. at a link.
Submitted 18 February, 2025; originally announced February 2025.
Comments: 13 pages, 9 figures
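
The second-stage ranking can be illustrated with an assumed reference-calibrated criterion in the spirit of membership-inference calibration; the paper's exact scoring rule may differ.

```python
def rank_pii_candidates(candidates, target_lm_logp, reference_lm_logp):
    """Rank recollected PII candidates by a reference-calibrated score (assumed form).

    candidates:       list of recollected PII strings for one masked slot.
    *_lm_logp(c):     callables returning the log-probability the target /
                      reference model assigns to the text with c filled in.
    A large target-vs-reference likelihood gap suggests memorized training
    PII rather than text that is merely generically plausible.
    """
    scored = [(c, target_lm_logp(c) - reference_lm_logp(c)) for c in candidates]
    return sorted(scored, key=lambda x: x[1], reverse=True)
```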

[8] arXiv:2502.12355 (https://arxiv.org/abs/2502.12355) [pdf, other]
Title: Hovering Flight of Soft-Actuated Insect-Scale Micro Aerial Vehicles using Deep Reinforcement Learning
Authors: Yi-Hsuan Hsiao, Wei-Tung Chen, Yun-Sheng Chang, Pulkit Agrawal, YuFeng Chen
Subjects: Robotics (cs.RO); Machine Learning (cs.LG); Systems and Control (eess.SY)
Abstract: Soft-actuated insect-scale micro aerial vehicles (IMAVs) pose unique challenges for designing robust and computationally efficient controllers. At the millimeter scale, fast robot dynamics ($\sim$ms), together with system delay, model uncertainty, and external disturbances, significantly affect flight performance. Here, we design a deep reinforcement learning (RL) controller that addresses system delay and uncertainties. To initialize this neural network (NN) controller, we propose a modified behavior cloning (BC) approach with state-action re-matching to account for delay and domain-randomized expert demonstration to tackle uncertainty. We then apply proximal policy optimization (PPO) to fine-tune the policy during RL, enhancing performance and smoothing commands. In simulations, our modified BC substantially increases the mean reward compared to baseline BC, and RL with PPO improves flight quality and reduces command fluctuations. We deploy this controller on two different insect-scale aerial robots that weigh 720 mg and 850 mg, respectively. The robots demonstrate multiple successful zero-shot hovering flights, with the longest lasting 50 seconds and root-mean-square errors of 1.34 cm in the lateral direction and 0.05 cm in altitude, marking the first end-to-end deep RL-based flight on soft-actuated IMAVs.
Submitted 17 February, 2025; originally announced February 2025.
Comments: 7 pages, 7 figures, accepted to 2025 IEEE International Conference on Soft Robotics (RoboSoft)
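
One plausible reading of state-action re-matching is to shift the pairing between logged states and commands by the known actuation delay before cloning. The sketch below encodes that assumption; the paper's exact re-matching may differ.

```python
def rematch_for_delay(states, actions, delay_steps):
    """Pair each state with the command issued delay_steps earlier (assumed form).

    Under an actuation delay, the command acting on the vehicle at state s_t
    is the one issued at t - delay_steps, so BC should clone that pairing
    rather than the naive (s_t, a_t) one.
    """
    if delay_steps == 0:
        return list(zip(states, actions))
    # Drop the first delay_steps states and the last delay_steps actions.
    return list(zip(states[delay_steps:], actions[:-delay_steps]))
```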

[9] arXiv:2502.12224 (https://arxiv.org/abs/2502.12224) [pdf, other]
Title: Accurate Expert Predictions in MoE Inference via Cross-Layer Gate
Authors: Zhiyuan Fang, Zicong Hong, Yuegui Huang, Yufeng Lyu, Wuhui Chen, Yue Yu, Fan Yu, Zibin Zheng
Subjects: Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
Abstract: Large Language Models (LLMs) have demonstrated impressive performance across various tasks, and their application in edge scenarios has attracted significant attention. However, sparse-activated Mixture-of-Experts (MoE) models, which are well suited for edge scenarios, have received relatively little attention due to their high memory demands. Offload-based methods have been proposed to address this challenge, but they face difficulties with expert prediction: inaccurate expert predictions can result in prolonged inference delays. To promote the application of MoE models in edge scenarios, we propose Fate, an offloading system designed for MoE models to enable efficient inference in resource-constrained environments. The key insight behind Fate is that gate inputs from adjacent layers can be effectively used for expert prefetching, achieving high prediction accuracy without additional GPU overhead. Furthermore, Fate employs a shallow-favoring expert caching strategy that increases the expert hit rate to 99%. Additionally, Fate integrates tailored quantization strategies for cache optimization and IO efficiency. Experimental results show that, compared to Load on Demand and the Expert Activation Path-based method, Fate achieves up to 4.5x and 1.9x speedups in prefill speed and up to 4.1x and 2.2x speedups in decoding speed, respectively, while maintaining inference quality. Moreover, Fate's performance improvements are scalable across different memory budgets.
Submitted 17 February, 2025; originally announced February 2025.
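
A minimal sketch of the cross-layer prefetching insight: push layer i's hidden state through layer i+1's gate to guess which experts to fetch early. All names below (gpu_cache, load_fn) are illustrative, not Fate's real interface.

```python
import numpy as np

def prefetch_experts(hidden_state, next_layer_gate_w, top_k, gpu_cache, load_fn):
    """Predict the next layer's experts from the current layer's gate input.

    Assumption (from the abstract's key insight): gate inputs of adjacent
    layers are similar enough that layer i's hidden state, routed through
    layer i+1's gate weights, predicts which experts layer i+1 will need,
    so their weights can be copied to the GPU before they are required.
    """
    logits = hidden_state @ next_layer_gate_w      # (num_experts,) gate scores
    predicted = np.argsort(logits)[-top_k:]        # likely top-k experts
    for e in predicted:
        if e not in gpu_cache:                     # cache miss: prefetch now
            gpu_cache[e] = load_fn(e)              # hypothetical CPU->GPU load
    return predicted
```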

[10] arXiv:2502.11863 (https://arxiv.org/abs/2502.11863) [pdf, other]
Title: FedEAT: A Robustness Optimization Framework for Federated LLMs
Authors: Yahao Pang, Xingyuan Wu, Xiaojin Zhang, Wei Chen, Hai Jin
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI)
Abstract: Large Language Models (LLMs) have made significant advancements in the domains of natural language understanding and automated content creation. However, they still face persistent problems, including substantial computational costs and inadequate availability of training data. The combination of Federated Learning (FL) and LLMs (federated LLMs) offers a solution by leveraging distributed data while protecting privacy, which positions it as an ideal choice for sensitive domains. However, federated LLMs still suffer from robustness challenges, including data heterogeneity, malicious clients, and adversarial attacks, which greatly hinder their applications. We first introduce the robustness problems in federated LLMs. To address these challenges, we propose FedEAT (Federated Embedding space Adversarial Training), a novel framework that applies adversarial training in the embedding space of the client LLM and employs a robust aggregation approach, specifically geometric median aggregation, to enhance the robustness of federated LLMs. Our experiments demonstrate that FedEAT effectively improves the robustness of federated LLMs with minimal performance loss.
Submitted 17 February, 2025; originally announced February 2025.

[11] arXiv:2502.11533 (https://arxiv.org/abs/2502.11533) [pdf, other]
Title: Be Cautious When Merging Unfamiliar LLMs: A Phishing Model Capable of Stealing Privacy
Authors: Zhenyuan Guo, Yi Shi, Wenlong Meng, Chen Gong, Chengkun Wei, Wenzhi Chen
Subjects: Computation and Language (cs.CL)
Abstract: Model merging is a widespread technology in large language models (LLMs) that integrates multiple task-specific LLMs into a unified one, enabling the merged model to inherit their specialized capabilities. Most task-specific LLMs are sourced from open-source communities and have not undergone rigorous auditing, potentially posing risks to model merging. This paper highlights an overlooked privacy risk: an unsafe model could compromise the privacy of the other LLMs involved in model merging. Specifically, we propose PhiMM, a privacy attack approach that trains a phishing model capable of stealing privacy using a crafted privacy-phishing instruction dataset. Furthermore, we introduce a novel model-cloaking method that mimics a specialized capability to conceal attack intent, luring users into merging the phishing model. Once victims merge the phishing model, the attacker can extract personally identifiable information (PII) or infer membership information (MI) by querying the merged model with phishing instructions. Experimental results show that merging a phishing model increases the risk of privacy breaches. Compared to the results before merging, PII leakage increased by 3.9% and MI leakage increased by 17.4% on average. We release the code of PhiMM through a link.
Submitted 17 February, 2025; originally announced February 2025.
arXiv:2502.11407 [pdf, other] cs.DC (Distributed, Parallel, and Cluster Computing)
Gensor: A Graph-based Construction Tensor Compilation Method for Deep Learning
Authors: Hangda Liu, Boyu Diao, Yu Yang, Wenxin Chen, Xiaohui Peng, Yongjun Xu
Abstract: High-performance deep learning depends on efficient tensor programs. In recent years, automatic tensor program optimization, also known as tensor compilation, has emerged as the primary approach to generating efficient tensor programs. However, generating kernels with higher performance in less time remains the key challenge. In this paper, we present Gensor, a graph-based construction tensor compilation method for deep learning, to further improve the performance of construction tensor compilation. Unlike existing tree-based methods, Gensor abstracts the construction space into a graph structure. Gensor then explores the construction space with Markov analysis. Gensor takes tensor programs as states and models scheduling primitives as transition actions between these states. Therefore, the process of tensor program construction optimization is abstracted as a graph traversal process. This approach expands the optimization space, improving operator performance while ensuring rapid optimization. Extensive experiments with typical operators demonstrate that Gensor significantly outperforms the state-of-the-art methods on GPUs for both cloud servers and edge devices. As a result, Gensor can generate operator kernels in seconds, with performance increasing by 18% on average and reaching a maximum of 30%. It also achieves high speedup for end-to-end models like ResNet-50 and GPT-2, with an average acceleration of 20%.
Submitted 16 February, 2025; originally announced February 2025.
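To make the states-and-transitions framing concrete, below is a schematic random walk over a schedule graph in which nodes stand for tensor-program states and edges for scheduling primitives. The graph and scoring function are invented toys, not Gensor's cost model.

# Schematic Markov-style traversal of a schedule graph. States stand in
# for tensor programs; edges for scheduling primitives (tile, vectorize...).
# Graph and scores are toy assumptions for illustration only.
import random

graph = {
    "naive":            ["tiled", "vectorized"],
    "tiled":            ["tiled+vectorized", "tiled+unrolled"],
    "vectorized":       ["tiled+vectorized"],
    "tiled+vectorized": [],
    "tiled+unrolled":   [],
}
score = {"naive": 1.0, "tiled": 2.5, "vectorized": 2.0,
         "tiled+vectorized": 4.0, "tiled+unrolled": 3.0}  # e.g. measured GFLOPS

def traverse(start="naive", steps=10):
    state, best = start, start
    for _ in range(steps):
        nxt = graph[state]
        if not nxt:
            break
        # Transition probability proportional to the successor's score,
        # mimicking a Markov chain biased toward better programs.
        state = random.choices(nxt, weights=[score[s] for s in nxt])[0]
        if score[state] > score[best]:
            best = state
    return best

print(traverse())  # typically "tiled+vectorized"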
arXiv:2502.11211 [pdf, other] cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition)
A Survey of LLM-based Agents in Medicine: How far are we from Baymax?
Authors: Wenxuan Wang, Zizhan Ma, Zheng Wang, Chenghan Wu, Wenting Chen, Xiang Li, Yixuan Yuan
Abstract: Large Language Models (LLMs) are transforming healthcare through the development of LLM-based agents that can understand, reason about, and assist with medical tasks. This survey provides a comprehensive review of LLM-based agents in medicine, examining their architectures, applications, and challenges. We analyze the key components of medical agent systems, including system profiles, clinical planning mechanisms, medical reasoning frameworks, and external capacity enhancement. The survey covers major application scenarios such as clinical decision support, medical documentation, training simulations, and healthcare service optimization. We discuss evaluation frameworks and metrics used to assess these agents' performance in healthcare settings. While LLM-based agents show promise in enhancing healthcare delivery, several challenges remain, including hallucination management, multimodal integration, implementation barriers, and ethical considerations. The survey concludes by highlighting future research directions, including advances in medical reasoning inspired by recent developments in LLM architectures, integration with physical systems, and improvements in training simulations. This work provides researchers and practitioners with a structured overview of the current state and future prospects of LLM-based agents in medicine.
Submitted 16 February, 2025; originally announced February 2025.
arXiv:2502.10803 [pdf, other] cs.CR (Cryptography and Security), cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition)
PDA: Generalizable Detection of AI-Generated Images via Post-hoc Distribution Alignment
Authors: Li Wang, Wenyu Chen, Zheng Li, Shanqing Guo
Abstract: The rapid advancement of generative models has led to the proliferation of highly realistic AI-generated images, posing significant challenges for detection methods to generalize across diverse and evolving generative techniques. Existing approaches often fail to adapt to unknown models without costly retraining, limiting their practicability. To fill this gap, we propose Post-hoc Distribution Alignment (PDA), a novel approach for the generalizable detection of AI-generated images. The key idea is to use the known generative model to regenerate undifferentiated test images. This process aligns the distributions of the re-generated real images with the known fake images, enabling effective distinction from unknown fake images. PDA employs a two-step detection framework: 1) evaluating whether a test image aligns with the known fake distribution based on deep k-nearest neighbor (KNN) distance, and 2) re-generating test images using known generative models to create pseudo-fake images for further classification. This alignment strategy allows PDA to effectively detect fake images without relying on unseen data or requiring retraining. Extensive experiments demonstrate the superiority of PDA, achieving 96.73% average accuracy across six state-of-the-art generative models, including GANs, diffusion models, and text-to-image models, and improving by 16.07% over the best baseline. Through t-SNE visualizations and KNN distance analysis, we provide insights into PDA's effectiveness in separating real and fake images. Our work provides a flexible and effective solution for real-world fake image detection, advancing the generalization ability of detection systems.
Submitted 15 February, 2025; originally announced February 2025.
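Step 1 of the two-step framework is a deep-KNN test. A minimal sketch under stated assumptions (features already extracted by some frozen backbone into numpy arrays, scikit-learn available, threshold tuned on validation data) could look like this; it is not PDA's released code.

# Deep-KNN test: flag a test feature as "aligned with the known fake
# distribution" when its k-th nearest-neighbor distance is small.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_fake_score(fake_feats: np.ndarray, test_feats: np.ndarray, k: int = 5):
    nn = NearestNeighbors(n_neighbors=k).fit(fake_feats)
    dists, _ = nn.kneighbors(test_feats)   # (n_test, k)
    return dists[:, -1]                    # distance to the k-th neighbor

fake_feats = np.random.randn(1000, 128)    # stand-in for known-fake features
test_feats = np.random.randn(10, 128)
scores = knn_fake_score(fake_feats, test_feats)
is_known_fake_like = scores < 16.0         # threshold: validation-tuned assumption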
arXiv:2502.10373 [pdf, other] cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.LG (Machine Learning), eess.AS (Audio and Speech Processing)
OWLS: Scaling Laws for Multilingual Speech Recognition and Translation Models
Authors: William Chen, Jinchuan Tian, Yifan Peng, Brian Yan, Chao-Han Huck Yang, Shinji Watanabe
Abstract: Neural scaling laws offer valuable insights for designing robust sequence processing architectures. While these laws have been extensively characterized in other modalities, their behavior in speech remains comparatively underexplored. In this work, we introduce OWLS, an open-access, reproducible suite of multilingual speech recognition and translation models spanning 0.25B to 18B parameters, with the 18B version being, to the best of our knowledge, the largest speech model. OWLS leverages up to 360K hours of public speech data across 150 languages, enabling a systematic investigation into how data, model, and compute scaling each influence performance in multilingual speech tasks. We use OWLS to derive neural scaling laws, showing how final performance can be reliably predicted when scaling. One of our key findings is that scaling enhances performance on low-resource languages/dialects, helping to mitigate bias and improve the accessibility of speech technologies. Finally, we show how OWLS can be used to power new research directions by discovering emergent abilities in large-scale speech models. Model checkpoints will be released on https://huggingface.co/collections/espnet/owls-scaling-laws-for-speech-recognition-and-translation-67ab7f991c194065f057ce8d for future studies.
Submitted 14 February, 2025; originally announced February 2025.
Comments: 23 pages, 13 figures
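Scaling laws of this kind are typically fit as power laws, error ≈ a·N^b, i.e. a straight line in log-log space. A minimal sketch of the fitting step follows; the data points are invented for illustration and are not OWLS results.

# Fit a power law err = a * N**b to (model size, error) pairs by linear
# regression in log-log space, then extrapolate. Hypothetical data points.
import numpy as np

params = np.array([0.25e9, 1e9, 4e9, 18e9])   # model sizes N
error  = np.array([0.30, 0.22, 0.16, 0.12])   # made-up WER-like metric

b, log_a = np.polyfit(np.log(params), np.log(error), deg=1)
a = np.exp(log_a)
predict = lambda n: a * n**b

print(f"err(N) ~ {a:.3g} * N^{b:.3f}")
print("predicted error at 40B params:", predict(40e9))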
arXiv:2502.09940 [pdf, other] cs.CL (Computation and Language), cs.SD (Sound), eess.AS (Audio and Speech Processing)
A Preliminary Exploration with GPT-4o Voice Mode
Authors: Yu-Xiang Lin, Chih-Kai Yang, Wei-Chih Chen, Chen-An Li, Chien-yu Huang, Xuanjun Chen, Hung-yi Lee
Abstract: With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command classification, semantic and grammatical reasoning, multilingual speech recognition, and singing analysis. It also shows greater robustness against hallucinations than other large audio-language models (LALMs). However, it struggles with tasks such as audio duration prediction and instrument classification. Additionally, GPT-4o's safety mechanisms cause it to decline tasks like speaker identification, age classification, MOS prediction, and audio deepfake detection. Notably, the model exhibits a significantly different refusal rate when responding to speaker verification tasks on different datasets. This is likely due to variations in the accompanying instructions or the quality of the input audio, suggesting the sensitivity of its built-in safeguards. Finally, we acknowledge that model performance varies with evaluation protocols. This report only serves as a preliminary exploration of the current state of LALMs.
Submitted 14 February, 2025; originally announced February 2025.
Comments: Work in progress
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08943v2-abstract-full').style.display = 'none'; document.getElementById('2502.08943v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 1 table, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08736">arXiv:2502.08736</a> <span> [<a href="https://arxiv.org/pdf/2502.08736">pdf</a>, <a href="https://arxiv.org/format/2502.08736">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Recurrent Memory for Online Interdomain Gaussian Processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenlong Chen</a>, <a href="/search/cs?searchtype=author&query=Kiyohara%2C+N">Naoki Kiyohara</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H+B+H">Harrison Bo Hua Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yingzhen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08736v1-abstract-short" style="display: inline;"> We propose a novel online Gaussian process (GP) model that is capable of capturing long-term memory in sequential data in an online regression setting. Our model, Online HiPPO Sparse Variational Gaussian Process Regression (OHSGPR), leverages the HiPPO (High-order Polynomial Projection Operators) framework, which is popularized in the RNN domain due to its long-range memory modeling capabilities.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08736v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08736v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08736v1-abstract-full" style="display: none;"> We propose a novel online Gaussian process (GP) model that is capable of capturing long-term memory in sequential data in an online regression setting. Our model, Online HiPPO Sparse Variational Gaussian Process Regression (OHSGPR), leverages the HiPPO (High-order Polynomial Projection Operators) framework, which is popularized in the RNN domain due to its long-range memory modeling capabilities. We interpret the HiPPO time-varying orthogonal projections as inducing variables with time-dependent orthogonal polynomial basis functions, which allows the SGPR inducing points to memorize the process history. We show that the HiPPO framework fits naturally into the interdomain GP framework and demonstrate that the kernel matrices can also be updated online in a recurrence form based on the ODE evolution of HiPPO. 
arXiv:2502.08736 [pdf, other] cs.LG (Machine Learning), stat.ML (Machine Learning)
Recurrent Memory for Online Interdomain Gaussian Processes
Authors: Wenlong Chen, Naoki Kiyohara, Harrison Bo Hua Zhu, Yingzhen Li
Abstract: We propose a novel online Gaussian process (GP) model that is capable of capturing long-term memory in sequential data in an online regression setting. Our model, Online HiPPO Sparse Variational Gaussian Process Regression (OHSGPR), leverages the HiPPO (High-order Polynomial Projection Operators) framework, which is popularized in the RNN domain due to its long-range memory modeling capabilities. We interpret the HiPPO time-varying orthogonal projections as inducing variables with time-dependent orthogonal polynomial basis functions, which allows the SGPR inducing points to memorize the process history. We show that the HiPPO framework fits naturally into the interdomain GP framework and demonstrate that the kernel matrices can also be updated online in a recurrence form based on the ODE evolution of HiPPO. We evaluate our method on time series regression tasks, showing that it outperforms the existing online GP method in terms of predictive performance and computational efficiency.
Submitted 12 February, 2025; originally announced February 2025.
Comments: 13 pages, 4 figures
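For readers unfamiliar with HiPPO, the memory mechanism referenced above is an ODE whose state holds polynomial projection coefficients of the input history. Below is a sketch of the HiPPO-LegS variant with a forward-Euler discretization, one standard choice and only a backdrop for OHSGPR, which builds interdomain GP inducing variables on top of this machinery.

# HiPPO-LegS in a nutshell: state x holds Legendre coefficients of the
# input history under the ODE  x'(t) = -(A/t) x(t) + (B/t) u(t).
# Forward-Euler at integer steps k:  x <- (I - A/k) x + (B/k) u_k.
import numpy as np

def hippo_legs_matrices(N: int):
    A = np.zeros((N, N))
    for n in range(N):
        for k in range(N):
            if n > k:
                A[n, k] = np.sqrt(2 * n + 1) * np.sqrt(2 * k + 1)
            elif n == k:
                A[n, k] = n + 1
    B = np.sqrt(2 * np.arange(N) + 1.0)
    return A, B

N = 32
A, B = hippo_legs_matrices(N)
x = np.zeros(N)
signal = np.sin(np.linspace(0, 8 * np.pi, 1000))
for k, u in enumerate(signal, start=1):
    x = (np.eye(N) - A / k) @ x + (B / k) * u   # recurrent memory update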
arXiv:2502.08005 [pdf, other] cs.LG (Machine Learning), cs.CV (Computer Vision and Pattern Recognition)
Towards Training One-Step Diffusion Models Without Distillation
Authors: Mingtian Zhang, Jiajun He, Wenlin Chen, Zijing Ou, José Miguel Hernández-Lobato, Bernhard Schölkopf, David Barber
Abstract: Recent advances in one-step generative models typically follow a two-stage process: first training a teacher diffusion model and then distilling it into a one-step student model. This distillation process traditionally relies on both the teacher model's score function to compute the distillation loss and its weights for student initialization. In this paper, we explore whether one-step generative models can be trained directly without this distillation process. First, we show that the teacher's score function is not essential and propose a family of distillation methods that achieve competitive results without relying on score estimation. Next, we demonstrate that initialization from teacher weights is indispensable for successful training. Surprisingly, we find that this benefit is not due to an improved "input-output" mapping but rather to the learned feature representations, which dominate distillation quality. Our findings provide a better understanding of the role of initialization in one-step model training and its impact on distillation quality.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 13 pages, Technical Report
arXiv:2502.07822 [pdf, other] cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)
PDM-SSD: Single-Stage Three-Dimensional Object Detector With Point Dilation
Authors: Ao Liang, Haiyang Hua, Jian Fang, Wenyu Chen, Huaici Zhao
Abstract: Current point-based detectors can only learn from the provided points, with limited receptive fields and insufficient global learning capabilities for such targets. In this paper, we present a novel Point Dilation Mechanism for single-stage 3D detection (PDM-SSD) that takes advantage of these two representations. Specifically, we first use a PointNet-style 3D backbone for efficient feature encoding. Then, a neck with Point Dilation Mechanism (PDM) is used to expand the feature space, which involves two key steps: point dilation and feature filling. The former expands points to a grid of a certain size centered around the sampled points in Euclidean space. The latter fills the unoccupied grid cells with features for backpropagation using spherical harmonic coefficients and a Gaussian density function in terms of direction and scale. Next, we associate multiple dilation centers and fuse coefficients to obtain sparse grid features through height compression. Finally, we design a hybrid detection head for joint learning, where, on one hand, the scene heatmap is predicted to complement the voting point set for improved detection accuracy, and, on the other hand, the target probability of detected boxes is calibrated through feature fusion. On the challenging Karlsruhe Institute of Technology and Toyota Technological Institute (KITTI) dataset, PDM-SSD achieves state-of-the-art results for multi-class detection among single-modal methods with an inference speed of 68 frames per second. We also demonstrate the advantages of PDM-SSD in detecting sparse and incomplete objects through numerous object-level instances. Additionally, PDM can serve as an auxiliary network to establish a connection between sampling points and object centers, thereby improving the accuracy of the model without sacrificing inference speed. Our code will be available at https://github.com/AlanLiangC/PDM-SSD.git.
Submitted 10 February, 2025; originally announced February 2025.
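The point-dilation step can be pictured as stamping a small voxel neighborhood around each sampled point and weighting the new cells by a Gaussian in distance. The sketch below shows only that geometric step with invented parameters; the paper's feature filling additionally uses spherical harmonic coefficients.

# Point dilation: expand each sampled point into a (2r+1)^3 voxel
# neighborhood and assign Gaussian distance weights to the new cells.
# Geometric illustration only; not the paper's feature-filling code.
import numpy as np

def dilate_points(points, voxel=0.2, r=1, sigma=0.3):
    offs = np.array(np.meshgrid(*[range(-r, r + 1)] * 3)).reshape(3, -1).T
    cells, weights = [], []
    for p in points:
        centers = np.round(p / voxel) * voxel + offs * voxel  # neighborhood cells
        d = np.linalg.norm(centers - p, axis=1)
        cells.append(centers)
        weights.append(np.exp(-d**2 / (2 * sigma**2)))        # Gaussian falloff
    return np.concatenate(cells), np.concatenate(weights)

pts = np.random.rand(4, 3)       # stand-in for sampled foreground points
cells, w = dilate_points(pts)
print(cells.shape, w.shape)      # (4*27, 3), (4*27,)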
arXiv:2502.07365 [pdf, other] cs.CL (Computation and Language), cs.LG (Machine Learning)
LongReD: Mitigating Short-Text Degradation of Long-Context Large Language Models via Restoration Distillation
Authors: Zican Dong, Junyi Li, Jinhao Jiang, Mingyu Xu, Wayne Xin Zhao, Bingning Wang, Weipeng Chen
Abstract: Large language models (LLMs) have gained extended context windows through scaling positional encodings and lightweight continual pre-training. However, this often leads to degraded performance on short-text tasks, while the reasons for this degradation remain insufficiently explored. In this work, we identify two primary factors contributing to this issue: distribution drift in hidden states and attention scores, and catastrophic forgetting during continual pre-training. To address these challenges, we propose Long Context Pre-training with Restoration Distillation (LongReD), a novel approach designed to mitigate short-text performance degradation by minimizing the distribution discrepancy between the extended and original models. Besides training on long texts, LongReD distills the hidden states of selected layers from the original model on short texts. Additionally, LongReD introduces a short-to-long distillation, aligning the output distribution on short texts with that on long texts by leveraging skipped positional indices. Experiments on common text benchmarks demonstrate that LongReD effectively preserves the model's short-text performance while maintaining comparable or even better capacity to handle long texts than baselines. Our code is available at https://github.com/RUCAIBox/LongReD.
Submitted 19 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
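The hidden-state distillation term described here is, at its core, an MSE between teacher and student activations at selected layers on short inputs. A minimal PyTorch sketch follows; the layer indices and loss weight are illustrative assumptions, not the repository's configuration.

# Hidden-state restoration distillation: match the extended (student)
# model's hidden states to the original (teacher) model's on short texts.
import torch
import torch.nn.functional as F

def restoration_loss(student_hs, teacher_hs, layers=(4, 8, 12), weight=1.0):
    """student_hs / teacher_hs: per-layer (batch, seq, dim) tensors, e.g.
    from outputs.hidden_states with output_hidden_states=True."""
    loss = sum(F.mse_loss(student_hs[l], teacher_hs[l].detach()) for l in layers)
    return weight * loss / len(layers)

# total_loss = lm_loss_on_long_texts + restoration_loss(s_hs, t_hs)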
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07337v1-abstract-full').style.display = 'none'; document.getElementById('2502.07337v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07331">arXiv:2502.07331</a> <span> [<a href="https://arxiv.org/pdf/2502.07331">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ERANet: Edge Replacement Augmentation for Semi-Supervised Meniscus Segmentation with Prototype Consistency Alignment and Conditional Self-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyue Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yongcheng Yao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+J">Junru Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shutian Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yudong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuihua Wang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+J">Jin Hong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weitian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07331v1-abstract-short" style="display: inline;"> Manual segmentation is labor-intensive, and automatic segmentation remains challenging due to the inherent variability in meniscal morphology, partial volume effects, and low contrast between the meniscus and surrounding tissues. To address these challenges, we propose ERANet, an innovative semi-supervised framework for meniscus segmentation that effectively leverages both labeled and unlabeled im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07331v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07331v1-abstract-full" style="display: none;"> Manual segmentation is labor-intensive, and automatic segmentation remains challenging due to the inherent variability in meniscal morphology, partial volume effects, and low contrast between the meniscus and surrounding tissues. To address these challenges, we propose ERANet, an innovative semi-supervised framework for meniscus segmentation that effectively leverages both labeled and unlabeled images through advanced augmentation and learning strategies. ERANet integrates three key components: edge replacement augmentation (ERA), prototype consistency alignment (PCA), and a conditional self-training (CST) strategy within a mean teacher architecture. ERA introduces anatomically relevant perturbations by simulating meniscal variations, ensuring that augmentations align with the structural context. 
arXiv:2502.07331 [pdf] cs.CV (Computer Vision and Pattern Recognition)
ERANet: Edge Replacement Augmentation for Semi-Supervised Meniscus Segmentation with Prototype Consistency Alignment and Conditional Self-Training
Authors: Siyue Li, Yongcheng Yao, Junru Zhong, Shutian Zhao, Yudong Zhang, Shuihua Wang, Jin Hong, Weitian Chen
Abstract: Manual segmentation is labor-intensive, and automatic segmentation remains challenging due to the inherent variability in meniscal morphology, partial volume effects, and low contrast between the meniscus and surrounding tissues. To address these challenges, we propose ERANet, an innovative semi-supervised framework for meniscus segmentation that effectively leverages both labeled and unlabeled images through advanced augmentation and learning strategies. ERANet integrates three key components: edge replacement augmentation (ERA), prototype consistency alignment (PCA), and a conditional self-training (CST) strategy within a mean teacher architecture. ERA introduces anatomically relevant perturbations by simulating meniscal variations, ensuring that augmentations align with the structural context. PCA enhances segmentation performance by aligning intra-class features and promoting compact, discriminative feature representations, particularly in scenarios with limited labeled data. CST improves segmentation robustness by iteratively refining pseudo-labels and mitigating the impact of label noise during training. Together, these innovations establish ERANet as a robust and scalable solution for meniscus segmentation, effectively addressing key barriers to practical implementation. We validated ERANet comprehensively on 3D Double Echo Steady State (DESS) and 3D Fast/Turbo Spin Echo (FSE/TSE) MRI sequences. The results demonstrate the superior performance of ERANet compared to state-of-the-art methods. The proposed framework achieves reliable and accurate segmentation of meniscus structures, even when trained on minimal labeled data. Extensive ablation studies further highlight the synergistic contributions of ERA, PCA, and CST, solidifying ERANet as a transformative solution for semi-supervised meniscus segmentation in medical imaging.
Submitted 11 February, 2025; originally announced February 2025.
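The mean-teacher architecture underlying ERANet keeps a teacher whose weights are an exponential moving average of the student's, with the teacher producing targets for unlabeled scans. A minimal sketch of the EMA update; the decay value is an assumption and this is generic semi-supervised scaffolding, not ERANet's code.

# Mean-teacher update: teacher weights track an exponential moving
# average of student weights.
import torch

@torch.no_grad()
def ema_update(teacher, student, decay=0.99):
    for t_p, s_p in zip(teacher.parameters(), student.parameters()):
        t_p.mul_(decay).add_(s_p, alpha=1.0 - decay)

# per training step:
#   loss = supervised_loss + consistency_loss(student(x_u), teacher(x_u))
#   loss.backward(); optimizer.step(); ema_update(teacher, student)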
arXiv:2502.07214 [pdf, other] cs.LG (Machine Learning), cs.AI (Artificial Intelligence), cs.DS (Data Structures and Algorithms)
Pareto Optimal Algorithmic Recourse in Multi-cost Function
Authors: Wen-Ling Chen, Hong-Chang Huang, Kai-Hung Lin, Shang-Wei Hwang, Hao-Tsung Yang
Abstract: In decision-making systems, algorithmic recourse aims to identify minimal-cost actions to alter an individual's features, thereby obtaining a desired outcome. This empowers individuals to understand, question, or alter decisions that negatively affect them. However, due to the variety and sensitivity of system environments and individual personalities, quantifying cost with a single function is nearly impossible when multiple criteria must be considered. Most current recourse mechanisms use gradient-based methods that assume cost functions are differentiable, which is often not applicable in real-world scenarios, resulting in sub-optimal solutions that compromise various criteria. These solutions are typically intractable and lack rigorous theoretical foundations, raising concerns regarding interpretability, reliability, and transparency from the explainable AI (XAI) perspective. To address these issues, this work proposes an algorithmic recourse framework that handles non-differentiable and discrete multi-cost functions. By formulating recourse as a multi-objective optimization problem and assigning weights to different criteria based on their importance, our method identifies Pareto optimal recourse recommendations. To demonstrate scalability, we incorporate the concept of an epsilon-net, proving the ability to find approximated Pareto optimal actions. Experiments show the trade-off between different criteria and the method's scalability in large graphs. Compared to current heuristic practices, our approach provides a stronger theoretical foundation and better aligns recourse suggestions with real-world requirements.
Submitted 10 February, 2025; originally announced February 2025.
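Pareto optimality over discrete candidate actions has a direct implementation: keep every action whose cost vector is not dominated by another's. A small sketch with invented actions and costs:

# Pareto filter: an action is kept unless some other action is at least
# as good on every criterion and strictly better on one. Candidate
# actions and their multi-criterion costs are invented for illustration.
def pareto_front(costs):
    def dominates(a, b):
        return all(x <= y for x, y in zip(a, b)) and any(x < y for x, y in zip(a, b))
    return [i for i, c in enumerate(costs)
            if not any(dominates(costs[j], c) for j in range(len(costs)) if j != i)]

actions = ["retrain", "add income proof", "reduce debt", "wait 6 months"]
costs = [(3, 9), (2, 4), (5, 2), (1, 8)]   # (effort, time) per action
for i in pareto_front(costs):
    print(actions[i], costs[i])            # "retrain" is dominated and dropped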
arXiv:2502.07107 [pdf, other] stat.AP (Applications), cs.CV (Computer Vision and Pattern Recognition), stat.ML (Machine Learning)
A Framework for Supervised and Unsupervised Segmentation and Classification of Materials Microstructure Images
Authors: Kungang Zhang, Daniel W. Apley, Wei Chen, Wing K. Liu, L. Catherine Brinson
Abstract: Microstructure of materials is often characterized through image analysis to understand processing-structure-properties linkages. We propose a largely automated framework that integrates unsupervised and supervised learning methods to classify micrographs according to microstructure phase/class and, for multiphase microstructures, segments them into different homogeneous regions. With the advance of manufacturing and imaging techniques, the ultra-high resolution of imaging that reveals the complexity of microstructures and the rapidly increasing quantity of images (i.e., micrographs) enables and necessitates a more powerful and automated framework to extract materials characteristics and knowledge. The framework we propose can be used to gradually build a database of microstructure classes relevant to a particular process or group of materials, which can help in analyzing and discovering/identifying new materials. The framework has three steps: (1) segmentation of multiphase micrographs through a recently developed score-based method so that different microstructure homogeneous regions can be identified in an unsupervised manner; (2) identification and classification of homogeneous regions of micrographs through an uncertainty-aware supervised classification network trained using the segmented micrographs from Step 1 with their identified labels verified via the built-in uncertainty quantification and minimal human inspection; (3) supervised segmentation (more powerful than the segmentation in Step 1) of multiphase microstructures through a segmentation network trained with micrographs and the results from Steps 1-2 using a form of data augmentation. This framework can iteratively characterize/segment new homogeneous or multiphase materials while expanding the database to enhance performance. The framework is demonstrated on various sets of materials and texture images.
Submitted 10 February, 2025; originally announced February 2025.
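Step (2)'s pattern of verifying labels "via the built-in uncertainty quantification and minimal human inspection" can be illustrated with the simplest uncertainty proxy, predictive entropy with a threshold routing low-confidence samples to a human. The entropy choice and threshold are assumptions, not the paper's specific network.

# Uncertainty-aware label verification: accept confident predictions
# automatically, route uncertain ones to human inspection.
import numpy as np

def route_by_uncertainty(probs, threshold=0.5):
    """probs: (n_samples, n_classes) softmax outputs."""
    entropy = -(probs * np.log(probs + 1e-12)).sum(axis=1)
    auto = entropy <= threshold          # auto-accepted labels
    return probs.argmax(axis=1), auto, ~auto

probs = np.array([[0.97, 0.02, 0.01],    # confident  -> auto-accept
                  [0.40, 0.35, 0.25]])   # ambiguous  -> human check
labels, auto, human = route_by_uncertainty(probs)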
arXiv:2502.07062 [pdf, other] cs.DS (Data Structures and Algorithms)
Breaking Barriers: Combinatorial Algorithms for Non-monotone Submodular Maximization with Sublinear Adaptivity and $1/e$ Approximation
Authors: Yixin Chen, Wenjing Chen, Alan Kuhnle
Abstract: With the rapid growth of data in modern applications, parallel combinatorial algorithms for maximizing non-monotone submodular functions have gained significant attention. The state-of-the-art approximation ratio of $1/e$ is currently achieved only by a continuous algorithm (Ene & Nguyen, 2020) with adaptivity $\mathcal O(\log(n))$. In this work, we focus on size constraints and propose a $(1/4-\varepsilon)$-approximation algorithm with high probability for this problem, as well as the first randomized parallel combinatorial algorithm achieving a $1/e-\varepsilon$ approximation ratio, which bridges the gap between continuous and combinatorial approaches. Both algorithms achieve $\mathcal O(\log(n)\log(k))$ adaptivity and $\mathcal O(n\log(n)\log(k))$ query complexity. Empirical results show our algorithms achieve competitive objective values, with the first algorithm particularly efficient in queries.
Submitted 10 February, 2025; originally announced February 2025.
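For context on the problem setting, the classical sequential baseline achieving $1/e$ for non-monotone submodular maximization under a size constraint is the random greedy of Buchbinder et al. (2014); a compact sketch is below. This is the baseline the parallel algorithms are measured against, not the paper's new algorithm.

# Random greedy (Buchbinder et al., 2014) for max f(S) s.t. |S| <= k with
# f non-monotone submodular: each step samples uniformly from the k
# largest marginal gains, padded with "dummy" no-ops.
import random

def random_greedy(ground, f, k):
    S = set()
    for _ in range(k):
        gains = sorted(((f(S | {e}) - f(S), e) for e in ground - S), reverse=True)
        top = gains[:k] + [(0.0, None)] * max(0, k - len(gains))  # dummy padding
        gain, e = random.choice(top)
        if e is not None and gain > 0:
            S.add(e)
    return S

# toy usage: a graph-cut objective is submodular and non-monotone
edges = {(1, 2), (2, 3), (3, 4), (1, 4), (2, 4)}
cut = lambda S: sum((u in S) != (v in S) for u, v in edges)
print(random_greedy({1, 2, 3, 4}, cut, k=2))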
arXiv:2502.06888 (https://arxiv.org/abs/2502.06888) [cs.LG, cs.AI]
Klotski: Efficient Mixture-of-Expert Inference via Expert-Aware Multi-Batch Pipeline
Authors: Zhiyuan Fang, Yuegui Huang, Zicong Hong, Yufeng Lyu, Wuhui Chen, Yue Yu, Fan Yu, Zibin Zheng
Abstract: Mixture of Experts (MoE), with its distinctive sparse structure, enables the scaling of language models up to trillions of parameters without significantly increasing computational costs. However, the substantial parameter size presents a challenge for inference, as the expansion in GPU memory cannot keep pace with the growth in parameters. Although offloading techniques utilise memory from the CPU and disk and parallelise the I/O and computation for efficiency, the computation for each expert in MoE models is often less than the I/O, resulting in numerous bubbles in the pipeline. Therefore, we propose Klotski, an efficient MoE inference engine that significantly reduces pipeline bubbles through a novel expert-aware multi-batch pipeline paradigm. The proposed paradigm uses batch processing to extend the computation time of the current layer to overlap with the loading time of the next layer. Although this idea has been effectively applied to dense models, more batches may activate more experts in the MoE, leading to longer loading times and more bubbles. Thus, unlike traditional approaches, we balance computation and I/O time and minimise bubbles by orchestrating their inference orders based on their heterogeneous computation and I/O requirements and activation patterns under different batch numbers. Moreover, to adapt to different hardware environments and models, we design a constraint-sensitive I/O-compute planner and a correlation-aware expert prefetcher for a schedule that minimises pipeline bubbles. Experimental results demonstrate that Klotski achieves a superior throughput-latency trade-off compared to state-of-the-art techniques, with throughput improvements of up to 85.12x.
Submitted 9 February, 2025; originally announced February 2025.
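The core overlap idea (compute several batches on the current layer while the next layer's experts are loading) can be illustrated with a small simulation; the timings, `load_experts`, and `compute` below are made-up stand-ins, not Klotski's engine or scheduler.

```python
# Hedged sketch of multi-batch compute / expert-I/O overlap via double
# buffering. Simulated delays stand in for GPU compute and CPU->GPU I/O.
import time
from concurrent.futures import ThreadPoolExecutor

def load_experts(layer):      # simulated I/O (e.g., CPU/disk -> GPU transfer)
    time.sleep(0.05)
    return f"experts[{layer}]"

def compute(batch, experts):  # simulated per-batch expert computation
    time.sleep(0.02)
    return f"out({batch},{experts})"

batches, num_layers = ["b0", "b1", "b2"], 4
with ThreadPoolExecutor(max_workers=1) as io:
    experts = load_experts(0)
    for layer in range(num_layers):
        # kick off the next layer's I/O before computing the current layer
        prefetch = io.submit(load_experts, layer + 1) if layer + 1 < num_layers else None
        # multiple batches stretch compute time so it can hide the I/O latency
        outs = [compute(b, experts) for b in batches]
        experts = prefetch.result() if prefetch else None
print("done:", outs)
```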
arXiv:2502.06736 (https://arxiv.org/abs/2502.06736) [cs.ET, cs.AI, cs.AR]
Low-power Spike-based Wearable Analytics on RRAM Crossbars
Authors: Abhiroop Bhattacharjee, Jinquan Shi, Wei-Chen Chen, Xinxin Wang, Priyadarshini Panda
Abstract: This work introduces a spike-based wearable analytics system utilizing Spiking Neural Networks (SNNs) deployed on an In-memory Computing engine based on RRAM crossbars, which are known for their compactness and energy-efficiency. Given the hardware constraints and noise characteristics of the underlying RRAM crossbars, we propose online adaptation of pre-trained SNNs in real-time using Direct Feedback Alignment (DFA) instead of traditional backpropagation (BP). DFA learning, which allows layer-parallel gradient computations, acts as a fast, energy- and area-efficient method for online adaptation of SNNs on RRAM crossbars, yielding better algorithmic performance than adaptation with BP. Through extensive simulations using our in-house hardware evaluation engine called DFA_Sim, we find that DFA achieves up to 64.1% lower energy consumption, 10.1% lower area overhead, and a 2.1x reduction in latency compared to BP, while delivering up to 7.55% higher inference accuracy on human activity recognition (HAR) tasks.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06736v1-abstract-full').style.display = 'none'; document.getElementById('2502.06736v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in 2025 IEEE International Symposium on Circuits and Systems (ISCAS)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE International Symposium on Circuits and Systems (ISCAS), 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06693">arXiv:2502.06693</a> <span> [<a href="https://arxiv.org/pdf/2502.06693">pdf</a>, <a href="https://arxiv.org/ps/2502.06693">ps</a>, <a href="https://arxiv.org/format/2502.06693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Recent Advances, Applications and Open Challenges in Machine Learning for Health: Reflections from Research Roundtables at ML4H 2024 Symposium </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Adibi%2C+A">Amin Adibi</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Z">Zongliang Ji</a>, <a href="/search/cs?searchtype=author&query=Kaur%2C+J+N">Jivat Neet Kaur</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Winston Chen</a>, <a href="/search/cs?searchtype=author&query=Healey%2C+E">Elizabeth Healey</a>, <a href="/search/cs?searchtype=author&query=Nuwagira%2C+B">Brighton Nuwagira</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Woollard%2C+G">Geoffrey Woollard</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M+A">Maxwell A Xu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+H">Hejie Cui</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+J">Johnny Xi</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+T">Trenton Chang</a>, <a href="/search/cs?searchtype=author&query=Bikia%2C+V">Vasiliki Bikia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Nicole Zhang</a>, <a href="/search/cs?searchtype=author&query=Noori%2C+A">Ayush Noori</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yuan Xia</a>, <a href="/search/cs?searchtype=author&query=Hossain%2C+M+B">Md. Belal Hossain</a>, <a href="/search/cs?searchtype=author&query=Frank%2C+H+A">Hanna A. 
arXiv:2502.06693 (https://arxiv.org/abs/2502.06693) [cs.LG, cs.AI, cs.CY]
Recent Advances, Applications and Open Challenges in Machine Learning for Health: Reflections from Research Roundtables at ML4H 2024 Symposium
Authors: Amin Adibi, Xu Cao, Zongliang Ji, Jivat Neet Kaur, Winston Chen, Elizabeth Healey, Brighton Nuwagira, Wenqian Ye, Geoffrey Woollard, Maxwell A Xu, Hejie Cui, Johnny Xi, Trenton Chang, Vasiliki Bikia, Nicole Zhang, Ayush Noori, Yuan Xia, Md. Belal Hossain, Hanna A. Frank, Alina Peluso, Yuan Pu, Shannon Zejiang Shen, John Wu, Adibvafa Fallahpour, Sazan Mahbub, et al. (17 additional authors not shown)
Abstract: The fourth Machine Learning for Health (ML4H) symposium was held in person on December 15th and 16th, 2024, in the traditional, ancestral, and unceded territories of the Musqueam, Squamish, and Tsleil-Waututh Nations in Vancouver, British Columbia, Canada. The symposium included research roundtable sessions to foster discussions between participants and senior researchers on timely and relevant topics for the ML4H community. The organization of the research roundtables at the conference involved 13 senior and 27 junior chairs across 13 tables. Each roundtable session included an invited senior chair (with substantial experience in the field), junior chairs (responsible for facilitating the discussion), and attendees from diverse backgrounds with an interest in the session's topic.
Submitted 10 February, 2025; originally announced February 2025.
arXiv:2502.06655 (https://arxiv.org/abs/2502.06655) [cs.AI]
Unbiased Evaluation of Large Language Models from a Causal Perspective
Authors: Meilin Chen, Jian Tian, Liang Ma, Di Xie, Weijie Chen, Jiang Zhu
Abstract: Benchmark contamination has become a significant concern in the LLM evaluation community. Previous Agents-as-an-Evaluator methods address this issue by involving agents in the generation of questions. Despite their success, the biases in Agents-as-an-Evaluator methods remain largely unexplored. In this paper, we present a theoretical formulation of evaluation bias, providing valuable insights into designing unbiased evaluation protocols. Furthermore, we identify two types of bias in Agents-as-an-Evaluator through carefully designed probing tasks on a minimal Agents-as-an-Evaluator setup. To address these issues, we propose the Unbiased Evaluator, an evaluation protocol that delivers a more comprehensive, unbiased, and interpretable assessment of LLMs. Extensive experiments reveal significant room for improvement in current LLMs. Additionally, we demonstrate that the Unbiased Evaluator not only offers strong evidence of benchmark contamination but also provides interpretable evaluation results.
Submitted 10 February, 2025; originally announced February 2025.
arXiv:2502.06097 (https://arxiv.org/abs/2502.06097) [cs.IR, cs.AI] doi:10.1145/3701716.3715251
NLGR: Utilizing Neighbor Lists for Generative Rerank in Personalized Recommendation Systems
Authors: Shuli Wang, Xue Wei, Senjie Kou, Chi Wang, Wenshuai Chen, Qi Tang, Yinhua Zhu, Xiong Xiao, Xingxing Wang
Abstract: Reranking plays a crucial role in modern multi-stage recommender systems by rearranging the initial ranking list. Due to the inherent challenges of combinatorial search spaces, some current research adopts an evaluator-generator paradigm, with a generator generating feasible sequences and an evaluator selecting the best sequence based on the estimated list utility. However, these methods still face two issues. Firstly, due to the goal inconsistency problem between the evaluator and the generator, the generator tends to fit the local optimum of the exposure distribution rather than optimize over the combinatorial space. Secondly, the strategy of generating target items one by one struggles to achieve optimality because it ignores information about subsequent items.
To address these issues, we propose NLGR, a model that utilizes neighbor lists for generative reranking and aims to improve the generator's performance in the combinatorial space. NLGR follows the evaluator-generator paradigm and improves the generator's training and generation methods. Specifically, we use neighbor lists in the combinatorial space to enhance the training process, enabling the generator to perceive relative scores and find the optimization direction. Furthermore, we propose a novel sampling-based non-autoregressive generation method, which allows the generator to jump flexibly from the current list to any neighbor list. Extensive experiments on public and industrial datasets validate NLGR's effectiveness, and we have successfully deployed NLGR on the Meituan food delivery platform.
Submitted 11 February, 2025; v1 submitted 9 February, 2025; originally announced February 2025.
Comments: Accepted by WWW 2025 Industry Track
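One way to picture the neighbor-list idea is as local search over ranked lists: a "neighbor" replaces one slot with one candidate item, and the search jumps to whichever neighbor an evaluator scores highest. The sketch below uses a toy utility in place of NLGR's learned evaluator and generator; it illustrates the list-space geometry, not the trained model.

```python
# Hedged sketch of neighbor-list local search over ranked lists.
# `evaluator` is a toy list-utility function, not NLGR's learned evaluator.
import itertools

def neighbors(current, pool):
    # every list obtained by swapping one position for one unused candidate
    for i, cand in itertools.product(range(len(current)), pool):
        if cand not in current:
            yield current[:i] + [cand] + current[i + 1:]

def evaluator(lst):
    # toy utility: item score discounted by rank position
    return sum(score / (rank + 1) for rank, score in enumerate(lst))

pool = [5, 9, 2, 7, 1, 8]
current = [2, 1, 5]                      # initial feasible list
improved = True
while improved:
    best = max(neighbors(current, pool), key=evaluator, default=current)
    improved = evaluator(best) > evaluator(current)
    if improved:
        current = best                   # jump to the better neighbor list
print(current, evaluator(current))
```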
arXiv:2502.06069 (https://arxiv.org/abs/2502.06069) [cs.HC] doi:10.1145/3706598.3713108
Living Bento: Heartbeat-Driven Noodles for Enriched Dining Dynamics
Authors: Weijen Chen, Qingyuan Gao, Zheng Hu, Kouta Minamizawa, Yun Suen Pai
Abstract: To enhance focused eating and dining socialization, previous Human-Food Interaction research has indicated that external devices can support these dining objectives and immersion. However, methods that focus on the food itself and the diners themselves have remained underdeveloped. In this study, we integrated biofeedback with food, utilizing diners' heart rates as a source of the food's appearance to promote focused eating and dining socialization. By employing LED lights, we dynamically displayed diners' real-time physiological signals through the transparency of the food. Results revealed significant effects on various aspects of dining immersion, such as awareness perceptions, attractiveness, attentiveness to each bite, and emotional bonds with the food. Furthermore, to promote dining socialization, we established a "Sharing Bio-Sync Food" dining system to strengthen emotional connections between diners. Based on these findings, we developed tableware that integrates biofeedback into the culinary experience.
Submitted 9 February, 2025; originally announced February 2025.
arXiv:2502.05449 (https://arxiv.org/abs/2502.05449) [cs.CL, cs.AI, cs.LG]
Iterative Deepening Sampling for Large Language Models
Authors: Weizhe Chen, Sven Koenig, Bistra Dilkina
Abstract: The recent release of OpenAI's o1 models and other similar frameworks showcasing test-time scaling laws has demonstrated their exceptional capability to tackle complex reasoning tasks. Inspired by this, subsequent research has revealed that such test-time scaling laws hinge on the model's ability to search both within a single response (intra-response) and across multiple responses (inter-response) during training. Crucially, beyond selecting a single optimal response, the model must also develop robust self-correction capabilities within its own outputs. However, training models to achieve effective self-evaluation and self-correction remains a significant challenge, heavily dependent on the quality of self-reflection data. In this paper, we address this challenge by focusing on enhancing the quality of self-reflection data generation for complex problem-solving, which can subsequently improve the training of next-generation large language models (LLMs). Specifically, we explore how manually triggering a model's self-correction mechanisms can improve performance on challenging reasoning tasks. To this end, we propose a novel iterative deepening sampling algorithm framework designed to enhance self-correction and generate higher-quality samples. Through extensive experiments on Math500 and AIME benchmarks, we demonstrate that our method achieves a higher success rate on difficult tasks and provide detailed ablation studies to analyze its effectiveness across diverse settings.
Submitted 7 February, 2025; originally announced February 2025.
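The general shape of such a loop (sample a small budget, verify, append a manual self-correction trigger, retry with a larger budget) can be sketched as below; `generate` and `is_correct` are stand-ins for an LLM call and an answer verifier, and the trigger phrase is illustrative, not the paper's exact procedure.

```python
# Hedged sketch of an iterative-deepening sampling loop with a manual
# self-correction trigger. The "model" is a noisy arithmetic stand-in.
import random

def generate(prompt, n):
    # stand-in model: returns n noisy numeric guesses for 17 * 24
    return [17 * 24 + random.choice([0, 0, 3, -5]) for _ in range(n)]

def is_correct(answer):
    return answer == 17 * 24

def iterative_deepening_sample(prompt, budgets=(1, 2, 4, 8)):
    for depth, n in enumerate(budgets):
        candidates = generate(prompt, n)
        good = [c for c in candidates if is_correct(c)]
        if good:
            return good[0], depth
        # deepen: manually trigger the model's self-correction behaviour
        prompt += "\nWait, let me re-check my previous answers."
    return None, len(budgets)

print(iterative_deepening_sample("Compute 17 * 24."))
```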
arXiv:2502.04722 (https://arxiv.org/abs/2502.04722) [cs.SD, cs.LG, eess.AS]
Singing Voice Conversion with Accompaniment Using Self-Supervised Representation-Based Melody Features
Authors: Wei Chen, Binzhu Sha, Jing Yang, Zhuo Wang, Fan Fan, Zhiyong Wu
Abstract: Melody preservation is crucial in singing voice conversion (SVC). However, in many scenarios, audio is often accompanied by background music (BGM), which can cause audio distortion and interfere with the extraction of melody and other key features, significantly degrading SVC performance. Previous methods have attempted to address this by using more robust neural network-based melody extractors, but their performance drops sharply in the presence of complex accompaniment. Other approaches involve performing source separation before conversion, but this often introduces noticeable artifacts, leading to a significant drop in conversion quality and increasing the user's operational costs. To address these issues, we introduce a novel SVC method that uses self-supervised representation-based melody features to improve melody modeling accuracy in the presence of BGM. In our experiments, we compare the effectiveness of different self-supervised learning (SSL) models for melody extraction and explore for the first time how SSL benefits the task of melody extraction.
The experimental results demonstrate that our proposed SVC model significantly outperforms existing baseline methods in terms of melody accuracy and shows higher similarity and naturalness in both subjective and objective evaluations across noisy and clean audio environments.
Submitted 7 February, 2025; originally announced February 2025.
Comments: Accepted by ICASSP2025
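As a rough illustration of pulling SSL representations from audio as candidate melody features, the snippet below uses HuBERT-base via torchaudio as one concrete SSL model; the paper compares several SSL models, and the choice of layer 6 as "pitch-informative" is an assumption for illustration, not the paper's pipeline.

```python
# Hedged sketch: frame-level SSL features from (fake) singing audio.
# Uses torchaudio's pretrained HuBERT-base bundle as a stand-in SSL model.
import torch
import torchaudio

bundle = torchaudio.pipelines.HUBERT_BASE
model = bundle.get_model().eval()

waveform = torch.randn(1, bundle.sample_rate * 2)   # 2 s of placeholder audio
with torch.inference_mode():
    # features[i]: frame-level representations from transformer layer i
    features, _ = model.extract_features(waveform)
melody_feats = features[6]   # a middle layer, assumed here to carry pitch cues
print(melody_feats.shape)    # (batch, frames, 768)
```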
arXiv:2502.03506 (https://arxiv.org/abs/2502.03506) [cs.MA, cs.LG]
Optimistic $\varepsilon$-Greedy Exploration for Cooperative Multi-Agent Reinforcement Learning
Authors: Ruoning Zhang, Siying Wang, Wenyu Chen, Yang Zhou, Zhitong Zhao, Zixuan Zhang, Ruijie Zhang
Abstract: The Centralized Training with Decentralized Execution (CTDE) paradigm is widely used in cooperative multi-agent reinforcement learning. However, due to the representational limitations of traditional monotonic value decomposition methods, algorithms can underestimate optimal actions, leading policies to suboptimal solutions. To address this challenge, we propose Optimistic $\varepsilon$-Greedy Exploration, focusing on enhancing exploration to correct value estimations. As our analysis indicates, the underestimation arises from insufficient sampling of optimal actions during exploration. We introduce an optimistic updating network to identify optimal actions and sample actions from its distribution with a probability of $\varepsilon$ during exploration, increasing the selection frequency of optimal actions. Experimental results in various environments reveal that Optimistic $\varepsilon$-Greedy Exploration effectively prevents the algorithm from settling on suboptimal solutions and significantly improves its performance compared to other algorithms.
Submitted 5 February, 2025; originally announced February 2025.
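The core twist is that the $\varepsilon$ branch samples from a distribution shaped by an optimistically updated value estimate instead of uniformly at random, so presumed-optimal actions get tried more often. In the sketch below, the optimistic network is reduced to a table with a max-biased update rule on a toy bandit; this is a stand-in, not the paper's architecture.

```python
# Hedged sketch of optimistic epsilon-greedy on a toy 4-armed bandit.
import numpy as np

rng = np.random.default_rng(1)
n_actions, eps, steps = 4, 0.3, 2000
q = np.zeros(n_actions)       # standard value estimates
q_opt = np.zeros(n_actions)   # optimistic estimates (only chase upside)

def reward(a):                # action 3 is best but noisy (underestimation risk)
    return rng.normal([0.0, 0.2, 0.4, 1.0][a], 1.0)

for t in range(steps):
    if rng.random() < eps:
        # explore: sample from the optimistic distribution, not uniformly
        probs = np.exp(q_opt) / np.exp(q_opt).sum()
        a = int(rng.choice(n_actions, p=probs))
    else:
        a = int(q.argmax())
    r = reward(a)
    q[a] += 0.1 * (r - q[a])
    q_opt[a] += 0.1 * max(r - q_opt[a], 0.0)   # optimistic (upward-only) update
print("greedy action:", int(q.argmax()))
```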
arXiv:2502.03228 (https://arxiv.org/abs/2502.03228) [cs.RO, cs.CV]
GARAD-SLAM: 3D GAussian splatting for Real-time Anti Dynamic SLAM
Authors: Mingrui Li, Weijian Chen, Na Cheng, Jingyuan Xu, Dong Li, Hongyu Wang
Abstract: The 3D Gaussian Splatting (3DGS)-based SLAM system has garnered widespread attention due to its excellent performance in real-time high-fidelity rendering. However, in real-world environments with dynamic objects, existing 3DGS-based SLAM systems often face mapping errors and tracking drift issues. To address these problems, we propose GARAD-SLAM, a real-time 3DGS-based SLAM system tailored for dynamic scenes. In terms of tracking, unlike traditional methods, we directly perform dynamic segmentation on Gaussians and map them back to the front-end to obtain dynamic point labels through a Gaussian pyramid network, achieving precise dynamic removal and robust tracking. For mapping, we impose rendering penalties on dynamically labeled Gaussians, which are updated through the network, to avoid irreversible erroneous removal caused by simple pruning. Our results on real-world datasets demonstrate that our method is competitive in tracking compared to baseline methods, generating fewer artifacts and higher-quality reconstructions in rendering.
Submitted 18 February, 2025; v1 submitted 5 February, 2025; originally announced February 2025.
Comments: The paper was accepted by ICRA 2025
arXiv:2502.03125 (https://arxiv.org/abs/2502.03125) [cs.MA, cs.LG]
Double Distillation Network for Multi-Agent Reinforcement Learning
Authors: Yang Zhou, Siying Wang, Wenyu Chen, Ruoning Zhang, Zhitong Zhao, Zixuan Zhang
Abstract: Multi-agent reinforcement learning typically employs a centralized training-decentralized execution (CTDE) framework to alleviate the non-stationarity of the environment. However, the partial observability during execution may lead to cumulative gap errors gathered by agents, impairing the training of effective collaborative policies. To overcome this challenge, we introduce the Double Distillation Network (DDN), which incorporates two distillation modules aimed at enhancing robust coordination and facilitating the collaboration process under constrained information. The external distillation module uses a global guiding network and a local policy network, employing distillation to reconcile the gap between global training and local execution. In addition, the internal distillation module introduces intrinsic rewards, drawn from state information, to enhance the exploration capabilities of agents. Extensive experiments demonstrate that DDN significantly improves performance across multiple scenarios.
Submitted 5 February, 2025; originally announced February 2025.
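The external-distillation term can be pictured as a KL divergence pulling a local, observation-only policy toward a global guiding policy that sees privileged state during centralized training. The sketch below abstracts that idea with toy linear networks; it is not DDN's actual module design.

```python
# Hedged sketch of teacher->student policy distillation across the
# global-state / local-observation gap (toy stand-in for DDN's external module).
import torch
import torch.nn.functional as F

state = torch.randn(32, 16)            # global state (available in training)
obs = state[:, :8]                     # partial observation (execution time)
global_net = torch.nn.Linear(16, 5)    # guiding network (teacher)
local_net = torch.nn.Linear(8, 5)      # local policy network (student)
opt = torch.optim.Adam(local_net.parameters(), lr=1e-2)

for _ in range(100):
    with torch.no_grad():
        teacher_logp = F.log_softmax(global_net(state), dim=-1)
    student_logp = F.log_softmax(local_net(obs), dim=-1)
    # KL(teacher || student): reconcile global training with local execution
    loss = F.kl_div(student_logp, teacher_logp, log_target=True,
                    reduction="batchmean")
    opt.zero_grad()
    loss.backward()
    opt.step()
print("distillation loss:", float(loss))
```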
arXiv:2502.02875 (https://arxiv.org/abs/2502.02875) [cs.MA]
Heterogeneous Value Decomposition Policy Fusion for Multi-Agent Cooperation
Authors: Siying Wang, Yang Zhou, Zhitong Zhao, Ruoning Zhang, Jinliang Shao, Wenyu Chen, Yuhua Cheng
Abstract: Value decomposition (VD) has become one of the most prominent solutions in cooperative multi-agent reinforcement learning. Most existing methods generally explore how to factorize the joint value and minimize the discrepancies between agent observations and characteristics of environmental states. However, direct decomposition may result in limited representation or difficulty in optimization. Orthogonal to designing a new factorization scheme, in this paper, we propose Heterogeneous Policy Fusion (HPF) to integrate the strengths of various VD methods. We construct a composite policy set from which policies are selected adaptively for interaction. Specifically, this adaptive mechanism allows agents' trajectories to benefit from diverse policy transitions while incorporating the advantages of each factorization method. Additionally, HPF introduces a constraint between these heterogeneous policies to rectify misleading updates caused by unexpected exploratory or suboptimal non-cooperative behavior. Experimental results on cooperative tasks show HPF's superior performance over multiple baselines, proving its effectiveness and ease of implementation.
Submitted 4 February, 2025; originally announced February 2025.
arXiv:2502.01718 (https://arxiv.org/abs/2502.01718) [cs.SE, cs.AI, cs.CL]
ACECODER: Acing Coder RL via Automated Test-Case Synthesis
Authors: Huaye Zeng, Dongfu Jiang, Haozhe Wang, Ping Nie, Xiaotong Chen, Wenhu Chen
Abstract: Most progress in recent coder models has been driven by supervised fine-tuning (SFT), while the potential of reinforcement learning (RL) remains largely unexplored, primarily due to the lack of reliable reward data/models in the code domain. In this paper, we address this challenge by leveraging automated large-scale test-case synthesis to enhance code model training. Specifically, we design a pipeline that generates extensive (question, test-cases) pairs from existing code data. Using these test cases, we construct preference pairs based on pass rates over sampled programs to train reward models with the Bradley-Terry loss. This yields an average 10-point improvement for Llama-3.1-8B-Ins and a 5-point improvement for Qwen2.5-Coder-7B-Ins through best-of-32 sampling, making the 7B model on par with 236B DeepSeek-V2.5. Furthermore, we conduct reinforcement learning with both reward models and test-case pass rewards, leading to consistent improvements across HumanEval, MBPP, BigCodeBench, and LiveCodeBench (V4). Notably, we follow the R1-style training to start from Qwen2.5-Coder-base directly and show that our RL training can improve the model on HumanEval-plus by over 25% and MBPP-plus by 6% in merely 80 optimization steps.
We believe our results highlight the huge potential of reinforcement learning in coder models.
Submitted 10 February, 2025; v1 submitted 3 February, 2025; originally announced February 2025.
Comments: 9 pages, 1 figure, 8 tables
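The Bradley-Terry objective named above trains a reward model so that, for each preference pair, the higher-pass-rate ("chosen") program out-scores the lower-pass-rate ("rejected") one: the loss is $-\log\sigma(r_w - r_l)$. A minimal PyTorch sketch, with a toy linear scorer over made-up feature vectors standing in for a code LLM:

```python
# Hedged sketch of Bradley-Terry reward-model training on preference pairs.
import torch
import torch.nn.functional as F

reward_model = torch.nn.Linear(32, 1)    # toy scorer r(x), not a code LLM
opt = torch.optim.Adam(reward_model.parameters(), lr=1e-3)

for _ in range(200):
    chosen = torch.randn(64, 32) + 0.5    # fake features: high pass rate
    rejected = torch.randn(64, 32) - 0.5  # fake features: low pass rate
    r_w = reward_model(chosen).squeeze(-1)
    r_l = reward_model(rejected).squeeze(-1)
    # Bradley-Terry: maximize log sigmoid(r_w - r_l)
    loss = -F.logsigmoid(r_w - r_l).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
print("final BT loss:", float(loss))
```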
arXiv:2502.01522 (https://arxiv.org/abs/2502.01522) [cs.CV]
BD-Diff: Generative Diffusion Model for Image Deblurring on Unknown Domains with Blur-Decoupled Learning
Authors: Junhao Cheng, Wei-Ting Chen, Xi Lu, Ming-Hsuan Yang
Abstract: Generative diffusion models trained on large-scale datasets have achieved remarkable progress in image synthesis. In favor of their ability to supplement missing details and generate aesthetically pleasing contents, recent works have applied them to image deblurring tasks via training an adapter on blurry-sharp image pairs to provide structural conditions for restoration. However, acquiring substantial amounts of realistic paired data is challenging and costly in real-world scenarios. On the other hand, relying solely on synthetic data often results in overfitting, leading to unsatisfactory performance when confronted with unseen blur patterns. To tackle this issue, we propose BD-Diff, a generative-diffusion-based model designed to enhance deblurring performance on unknown domains by decoupling structural features and blur patterns through joint training on three specially designed tasks. We employ two Q-Formers as structural-representation and blur-pattern extractors, respectively. The features they extract are used for the supervised deblurring task on synthetic data and, simultaneously, for the unsupervised blur-transfer task that leverages unpaired blurred images from the target domain. Furthermore, we introduce a reconstruction task to make the structural features and blur patterns complementary. This blur-decoupled learning process enhances the generalization capabilities of BD-Diff when encountering unknown domain blur patterns. Experiments on real-world datasets demonstrate that BD-Diff outperforms existing state-of-the-art methods in blur removal and structural preservation in various challenging scenarios. The code will be released at https://github.com/donahowe/BD-Diff
Submitted 3 February, 2025; originally announced February 2025.
Comments: We propose BD-Diff to integrate generative diffusion model into unpaired deblurring tasks
arXiv:2502.01456 [pdf, other] (cs.LG, cs.AI, cs.CL)
Process Reinforcement through Implicit Rewards
Authors: Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, Jiarui Yuan, Huayu Chen, Kaiyan Zhang, Xingtai Lv, Shuo Wang, Yuan Yao, Xu Han, Hao Peng, Yu Cheng, Zhiyuan Liu, Maosong Sun, Bowen Zhou, Ning Ding
Abstract: Dense process rewards have proven a more effective alternative to sparse outcome-level rewards in the inference-time scaling of large language models (LLMs), particularly in tasks requiring complex multi-step reasoning. While dense rewards also offer an appealing choice for the reinforcement learning (RL) of LLMs, since their fine-grained rewards have the potential to address some inherent issues of outcome rewards, such as training efficiency and credit assignment, this potential remains largely unrealized. This can be primarily attributed to the challenges of training process reward models (PRMs) online, where collecting high-quality process labels is prohibitively expensive, making them particularly vulnerable to reward hacking. To address these challenges, we propose PRIME (Process Reinforcement through IMplicit rEwards), which enables online PRM updates using only policy rollouts and outcome labels through implicit process rewards. PRIME combines well with various advantage functions and forgoes the dedicated reward-model training phase that existing approaches require, substantially reducing the development overhead. We demonstrate PRIME's effectiveness on competition-level mathematics and coding. Starting from Qwen2.5-Math-7B-Base, PRIME achieves a 15.1% average improvement across several key reasoning benchmarks over the SFT model. Notably, our resulting model, Eurus-2-7B-PRIME, surpasses Qwen2.5-Math-7B-Instruct on seven reasoning benchmarks with 10% of its training data.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 20 pages. Model&Code&Data available at https://github.com/PRIME-RL/PRIME
arXiv:2502.00963 [pdf, other] (cs.LG)
PDE-Controller: LLMs for Autoformalization and Reasoning of PDEs
Authors: Mauricio Soroco, Jialin Song, Mengzhou Xia, Kye Emond, Weiran Sun, Wuyang Chen
Abstract: While recent AI-for-math has made strides in pure mathematics, areas of applied mathematics, particularly PDEs, remain underexplored despite their significant real-world applications. We present PDE-Controller, a framework that enables large language models (LLMs) to control systems governed by partial differential equations (PDEs). Our approach enables LLMs to transform informal natural language instructions into formal specifications, and then execute reasoning and planning steps to improve the utility of PDE control. We build a holistic solution comprising datasets (both human-written cases and 2 million synthetic samples), math-reasoning models, and novel evaluation metrics, all of which require significant effort. Our PDE-Controller significantly outperforms prompting the latest open-source and GPT models in reasoning, autoformalization, and program synthesis, achieving up to a 62% improvement in utility gain for PDE control. By bridging the gap between language generation and PDE systems, we demonstrate the potential of LLMs in addressing complex scientific and engineering challenges. We will release all data, model checkpoints, and code at https://pde-controller.github.io/.
Submitted 2 February, 2025; originally announced February 2025.
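To make the "informal instruction, formal specification, utility" pipeline concrete, here is a purely illustrative sketch (not the released code): a natural-language goal such as "keep the temperature below u_max on [a, b] at the final time" is assumed to have already been formalized into a constraint, which is then scored against a finite-difference rollout of a 1D heat equation under a candidate control source.

    import numpy as np

    def rollout_heat_1d(u0, source, alpha=0.01, dt=1e-3, dx=1e-2, steps=1000):
        # Explicit finite-difference rollout with periodic boundaries;
        # alpha * dt / dx**2 = 0.1 keeps this explicit scheme stable.
        u = u0.copy()
        for _ in range(steps):
            lap = (np.roll(u, -1) - 2.0 * u + np.roll(u, 1)) / dx**2
            u = u + dt * (alpha * lap + source)
        return u

    def utility(u_final, x, a, b, u_max):
        # Margin by which the formalized spec "u(x, T) <= u_max on [a, b]" holds;
        # positive means satisfied, and larger is better for planning.
        region = (x >= a) & (x <= b)
        return float(np.min(u_max - u_final[region]))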
arXiv:2502.00646 [pdf, other] (cs.CR, cs.AI, cs.LG)
TrojanTime: Backdoor Attacks on Time Series Classification
Authors: Chang Dong, Zechao Sun, Guangdong Bai, Shuying Piao, Weitong Chen, Wei Emma Zhang
Abstract: Time Series Classification (TSC) is highly vulnerable to backdoor attacks, posing significant security threats. Existing methods primarily focus on data poisoning during the training phase, designing sophisticated triggers to improve stealthiness and attack success rate (ASR). However, in practical scenarios, attackers often face restrictions in accessing training data. Moreover, when the training data is inaccessible, it is a challenge for the model to maintain generalization ability on clean test data while remaining vulnerable to poisoned inputs. To address these challenges, we propose TrojanTime, a novel two-step training algorithm. In the first stage, we generate a pseudo-dataset from an external arbitrary dataset through targeted adversarial attacks. The clean model is then continually trained on this pseudo-dataset and its poisoned version. To ensure generalization ability, the second stage employs a carefully designed training strategy, combining logits alignment and batch-norm freezing. We evaluate TrojanTime using five types of triggers across four TSC architectures on UCR benchmark datasets from diverse domains. The results demonstrate the effectiveness of TrojanTime in executing backdoor attacks while maintaining clean accuracy. Finally, to mitigate this threat, we propose a defensive unlearning strategy that effectively reduces the ASR while preserving clean accuracy.
Submitted 1 February, 2025; originally announced February 2025.
Comments: 13 pages, 3 figures, 3 tables
ACM Class: I.2.0
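The second-stage strategy named in the abstract (logits alignment plus batch-norm freezing) can be pictured with a short PyTorch sketch. This is an interpretation, not the paper's code; model, clean_model, and the weighting lam are hypothetical.

    import torch
    import torch.nn.functional as F

    def freeze_batchnorm(model):
        # Keep the clean model's normalization statistics fixed while poisoning.
        for m in model.modules():
            if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
                m.eval()
                for p in m.parameters():
                    p.requires_grad_(False)

    def stage2_loss(model, clean_model, x_clean, x_poison, y_target, lam=1.0):
        # Backdoor objective: poisoned inputs should map to the attacker's target.
        ce = F.cross_entropy(model(x_poison), y_target)
        # Logits alignment: on clean inputs, stay close to the original clean
        # model so generalization on clean test data is preserved.
        with torch.no_grad():
            ref = clean_model(x_clean)
        align = F.mse_loss(model(x_clean), ref)
        return ce + lam * align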
arXiv:2502.00314 [pdf, other] (eess.IV, cs.CV)
A Study on the Performance of U-Net Modifications in Retroperitoneal Tumor Segmentation
Authors: Moein Heidari, Ehsan Khodapanah Aghdam, Alexander Manzella, Daniel Hsu, Rebecca Scalabrino, Wenjin Chen, David J. Foran, Ilker Hacihaliloglu
Abstract: The retroperitoneum hosts a variety of tumors, including rare benign and malignant types, which pose diagnostic and treatment challenges due to their infrequency and proximity to vital structures. Estimating tumor volume is difficult due to their irregular shapes, and manual segmentation is time-consuming. Automatic segmentation using U-Net and its variants, incorporating Vision Transformer (ViT) elements, has shown promising results but struggles with high computational demands. To address this, architectures like the Mamba State Space Model (SSM) and Extended Long-Short Term Memory (xLSTM) offer efficient solutions by handling long-range dependencies with lower resource consumption. This study evaluates U-Net enhancements, including CNN, ViT, Mamba, and xLSTM, on a new in-house CT dataset and a public organ segmentation dataset. The proposed ViLU-Net model integrates Vi-blocks for improved segmentation. Results highlight xLSTM's efficiency in the U-Net framework. The code is publicly accessible on GitHub.
Submitted 31 January, 2025; originally announced February 2025.
Comments: Accepted for presentation at the 2025 SPIE Medical Imaging Conference
arXiv:2501.19339 [pdf, other] (cs.CV, cs.CL)
PixelWorld: Towards Perceiving Everything as Pixels
Authors: Zhiheng Lyu, Xueguang Ma, Wenhu Chen
Abstract: Existing foundation models typically process visual input as pixels and textual input as tokens, a paradigm that contrasts with human perception, where both modalities are processed in a unified manner. With the rise of embodied and agentic AI, where inputs primarily come from camera pixels, the need for a unified perception framework becomes increasingly evident. In this paper, we propose to unify all modalities (text, tables, code, diagrams, images, etc.) as pixel inputs, i.e., "Perceive Everything as Pixels" (PEAP). We introduce PixelWorld, a novel evaluation suite that unifies all the mentioned modalities into pixel space to gauge existing models' performance. Our findings show that (1) PEAP outperforms the token-based-input baseline on multimodal datasets, benefiting from unified input for better disambiguation; (2) all models show significant declines in reasoning and coding capabilities when processing pixel-based input, underscoring the need to enhance foundation models' perceptual abilities; (3) larger models can maintain strong performance on non-reasoning tasks under PEAP, while smaller models like Phi-3.5-V suffer significant performance degradation; (4) the attention pattern of PEAP is highly aligned with that of text-token input; (5) PEAP can be accelerated significantly by exploiting spatial sparsity. We conclude that existing frontier models are competent in pixel perception; however, there is still headroom for improvement. Our code and dataset will be released upon acceptance.
Submitted 31 January, 2025; originally announced January 2025.
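The input side of PEAP is simple to picture: instead of tokenizing text, render it and hand the image to the vision encoder. A minimal sketch with PIL (illustrative only; PixelWorld's actual rendering parameters are not reproduced here):

    from PIL import Image, ImageDraw

    def text_to_pixels(text, width=800, line_height=14, pad=10):
        # Render plain text onto a white canvas with PIL's default bitmap font;
        # the resulting image is consumed by the model instead of token IDs.
        lines = text.splitlines() or [""]
        img = Image.new("RGB", (width, pad * 2 + line_height * len(lines)), "white")
        draw = ImageDraw.Draw(img)
        for i, line in enumerate(lines):
            draw.text((pad, pad + line_height * i), line, fill="black")
        return img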
arXiv:2501.19300 [pdf, other] (cs.LG)
Offline Learning for Combinatorial Multi-armed Bandits
Authors: Xutong Liu, Xiangxiang Dai, Jinhang Zuo, Siwei Wang, Carlee-Joe Wong, John C. S. Lui, Wei Chen
Abstract: The combinatorial multi-armed bandit (CMAB) is a fundamental sequential decision-making framework, extensively studied over the past decade. However, existing work primarily focuses on the online setting, overlooking the substantial costs of online interactions and the readily available offline datasets. To overcome these limitations, we introduce Off-CMAB, the first offline learning framework for CMAB. Central to our framework is the combinatorial lower confidence bound (CLCB) algorithm, which combines pessimistic reward estimations with combinatorial solvers. To characterize the quality of offline datasets, we propose two novel data coverage conditions and prove that, under these conditions, CLCB achieves a near-optimal suboptimality gap, matching the theoretical lower bound up to a logarithmic factor. We validate Off-CMAB through practical applications, including learning to rank, large language model (LLM) caching, and social influence maximization, showing its ability to handle nonlinear reward functions, general feedback models, and out-of-distribution action samples that exclude optimal or even feasible actions. Extensive experiments on synthetic and real-world datasets further highlight the superior performance of CLCB.
Submitted 31 January, 2025; originally announced January 2025.
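The pessimism-plus-solver structure of CLCB is easy to sketch for the simplest combinatorial action (choose k arms). The confidence radius below is a generic Hoeffding-style bound, and the whole snippet is an illustration, not the paper's algorithm in full generality.

    import numpy as np

    def clcb_top_k(counts, reward_sums, k, delta=0.05):
        # Pessimistic (lower-confidence-bound) value of each base arm,
        # computed once from a fixed offline dataset.
        n = np.maximum(counts, 1)
        means = reward_sums / n
        radius = np.sqrt(np.log(2.0 * len(counts) / delta) / (2.0 * n))
        lcb = means - radius
        # "Combinatorial solver" for the top-k action set: pick the k largest LCBs.
        return np.argsort(lcb)[-k:]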
arXiv:2501.19160 [pdf, other] (cs.CV)
RMDM: Radio Map Diffusion Model with Physics Informed
Authors: Haozhe Jia, Wenshuo Chen, Zhihui Huang, Hongru Xiao, Nanqian Jia, Keming Wu, Songning Lai, Yutao Yue
Abstract: With the rapid development of wireless communication technology, the efficient utilization of spectrum resources, optimization of communication quality, and intelligent communication have become critical. Radio map reconstruction is essential for enabling advanced applications, yet challenges such as complex signal propagation and sparse data hinder accurate reconstruction. To address these issues, we propose the Radio Map Diffusion Model (RMDM), a physics-informed framework that integrates Physics-Informed Neural Networks (PINNs) to incorporate constraints like the Helmholtz equation. RMDM employs a dual U-Net architecture: the first ensures physical consistency by minimizing PDE residuals, boundary conditions, and source constraints, while the second refines predictions via diffusion-based denoising. By leveraging physical laws, RMDM significantly enhances accuracy, robustness, and generalization. Experiments demonstrate that RMDM outperforms state-of-the-art methods, achieving an NMSE of 0.0031 and an RMSE of 0.0125 under the Static RM (SRM) setting, and an NMSE of 0.0047 and an RMSE of 0.0146 under the Dynamic RM (DRM) setting. These results establish a novel paradigm for integrating physics-informed and data-driven approaches in radio map reconstruction, particularly under sparse data conditions.
Submitted 31 January, 2025; originally announced January 2025.
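The physics-informed half of the dual U-Net minimizes PDE residuals; for the Helmholtz equation, nabla^2 u + k^2 u = 0, the residual can be computed with autograd. The sketch below is generic PINN boilerplate under that assumption (u_net and the collocation points xy are hypothetical), not the RMDM code.

    import torch

    def helmholtz_residual(u_net, xy, k=1.0):
        # xy: (N, 2) collocation points; u_net maps (N, 2) -> (N, 1).
        xy = xy.clone().requires_grad_(True)
        u = u_net(xy)
        grad = torch.autograd.grad(u.sum(), xy, create_graph=True)[0]  # (N, 2)
        lap = 0.0
        for i in range(xy.shape[1]):  # Laplacian = sum of pure second derivatives
            lap = lap + torch.autograd.grad(grad[:, i].sum(), xy,
                                            create_graph=True)[0][:, i]
        # Mean squared Helmholtz residual, usable as one term of the PINN loss.
        return ((lap + k**2 * u.squeeze(-1)) ** 2).mean()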
arXiv:2501.19094 [pdf, other] (cs.CV, eess.IV)
Ambient Denoising Diffusion Generative Adversarial Networks for Establishing Stochastic Object Models from Noisy Image Data
Authors: Xichen Xu, Wentao Chen, Weimin Zhou
Abstract: It is widely accepted that medical imaging systems should be objectively assessed via task-based image quality (IQ) measures that ideally account for all sources of randomness in the measured image data, including the variation in the ensemble of objects to be imaged. Stochastic object models (SOMs) that can randomly draw samples from the object distribution can be employed to characterize object variability. To establish realistic SOMs for task-based IQ analysis, it is desirable to employ experimental image data. However, experimental image data acquired from medical imaging systems are subject to measurement noise. Previous work investigated the ability of deep generative models (DGMs) that employ an augmented generative adversarial network (GAN), AmbientGAN, to establish SOMs from noisy measured image data. Recently, denoising diffusion models (DDMs) have emerged as a leading DGM for image synthesis and can produce image quality superior to that of GANs. However, the original DDMs have a slow image-generation process because of the Gaussian assumption in the denoising steps. More recently, the denoising diffusion GAN (DDGAN) was proposed to permit fast image generation while maintaining generated-image quality comparable to the original DDMs. In this work, we propose an augmented DDGAN architecture, Ambient DDGAN (ADDGAN), for learning SOMs from noisy image data. Numerical studies that consider clinical computed tomography (CT) images and digital breast tomosynthesis (DBT) images are conducted. The ability of the proposed ADDGAN to learn realistic SOMs from noisy image data is demonstrated, and ADDGAN is shown to significantly outperform the advanced AmbientGAN models for synthesizing high-resolution medical images with complex textures.
Submitted 31 January, 2025; originally announced January 2025.
Comments: SPIE Medical Imaging 2025
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18453v1-abstract-full').style.display = 'none'; document.getElementById('2501.18453v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AICAS 2025. This is the preprint version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18418">arXiv:2501.18418</a> <span> [<a href="https://arxiv.org/pdf/2501.18418">pdf</a>, <a href="https://arxiv.org/format/2501.18418">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Task-based Regularization in Penalized Least-Squares for Binary Signal Detection Tasks in Medical Image Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wentao Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianming Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Weimin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18418v2-abstract-short" style="display: inline;"> Image denoising algorithms have been extensively investigated for medical imaging. To perform image denoising, penalized least-squares (PLS) problems can be designed and solved, in which the penalty term encodes prior knowledge of the object being imaged. Sparsity-promoting penalties, such as total variation (TV), have been a popular choice for regularizing image denoising problems. However, such… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18418v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18418v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18418v2-abstract-full" style="display: none;"> Image denoising algorithms have been extensively investigated for medical imaging. To perform image denoising, penalized least-squares (PLS) problems can be designed and solved, in which the penalty term encodes prior knowledge of the object being imaged. Sparsity-promoting penalties, such as total variation (TV), have been a popular choice for regularizing image denoising problems. However, such hand-crafted penalties may not be able to preserve task-relevant information in measured image data and can lead to oversmoothed image appearances and patchy artifacts that degrade signal detectability. Supervised learning methods that employ convolutional neural networks (CNNs) have emerged as a popular approach to denoising medical images. 
arXiv:2501.18418 [pdf, other] (eess.IV, cs.CV)
Task-based Regularization in Penalized Least-Squares for Binary Signal Detection Tasks in Medical Image Denoising
Authors: Wentao Chen, Tianming Xu, Weimin Zhou
Abstract: Image denoising algorithms have been extensively investigated for medical imaging. To perform image denoising, penalized least-squares (PLS) problems can be designed and solved, in which the penalty term encodes prior knowledge of the object being imaged. Sparsity-promoting penalties, such as total variation (TV), have been a popular choice for regularizing image denoising problems. However, such hand-crafted penalties may not be able to preserve task-relevant information in measured image data and can lead to oversmoothed image appearances and patchy artifacts that degrade signal detectability. Supervised learning methods that employ convolutional neural networks (CNNs) have emerged as a popular approach to denoising medical images. However, studies have shown that CNNs trained with loss functions based on traditional image quality measures can lead to a loss of task-relevant information in images. Some previous works have investigated task-based loss functions that employ model observers for training CNN denoising models; however, such training processes typically require a large number of noisy and ground-truth (noise-free or low-noise) image data pairs. In this work, we propose a task-based regularization strategy for use with PLS in medical image denoising. The proposed task-based regularization is associated with the likelihood of linear test statistics of noisy images for Gaussian noise models. The proposed method does not require ground-truth image data and solves an individual optimization problem for denoising each image. Computer-simulation studies are conducted that consider a multivariate-normally distributed (MVN) lumpy background and a binary texture background. It is demonstrated that the proposed regularization strategy can effectively improve signal detectability in denoised images.
Submitted 31 January, 2025; v1 submitted 30 January, 2025; originally announced January 2025.
Comments: SPIE Medical Imaging 2025
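Structurally, the proposed approach keeps the usual PLS form and adds a task-aware term, i.e. something of the shape argmin_f ||g - f||^2 + beta * R(f) + gamma * T(f; w), solved per image with no ground truth. The gradient-descent sketch below only illustrates that structure on a 1-D signal: the quadratic roughness penalty and the "preserve the linear test statistic w.f" term are stand-ins, not the paper's likelihood-based regularizer.

    import numpy as np

    def pls_denoise(g, w, beta=0.1, gamma=1.0, lr=0.1, iters=200):
        # g: noisy 1-D signal; w: linear test-statistic template (Hotelling-like).
        f = g.copy()
        for _ in range(iters):
            grad_fid = 2.0 * (f - g)               # data-fidelity gradient
            d = np.diff(f)                         # roughness penalty ||Df||^2
            grad_reg = np.concatenate(([-2.0 * d[0]],
                                       2.0 * (d[:-1] - d[1:]),
                                       [2.0 * d[-1]]))
            # Task term: keep the test statistic w.f close to its noisy value w.g,
            # a crude surrogate for preserving signal detectability.
            grad_task = 2.0 * (w @ f - w @ g) * w
            f = f - lr * (grad_fid + beta * grad_reg + gamma * grad_task)
        return f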
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 
196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>