Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,440 results for author: <span class="mathjax">Yang, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yang%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yang, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span> [<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jian Zhou</a>, <a 
href="/search/cs?searchtype=author&query=Tan%2C+K">Kaijun Tan</a>, <a href="/search/cs?searchtype=author&query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v1-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v1-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v1-abstract-full').style.display = 'none'; document.getElementById('2502.10248v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08882">arXiv:2502.08882</a> <span> [<a href="https://arxiv.org/pdf/2502.08882">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> 2D Integrated Bayesian Tomography of Plasma Electron Density Profile for HL-3 Based on Gaussian Process </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+R">Renjie Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zongyu Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhijun Wang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yixiong Wei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08882v1-abstract-short" style="display: inline;"> This paper introduces an integrated Bayesian model that combines line integral measurements and point values using Gaussian Process (GP). The proposed method leverages Gaussian Process Regression (GPR) to incorporate point values into 2D profiles and employs coordinate mapping to integrate magnetic flux information for 2D inversion. The average relative error of the reconstructed profile, using th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08882v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08882v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08882v1-abstract-full" style="display: none;"> This paper introduces an integrated Bayesian model that combines line integral measurements and point values using Gaussian Process (GP). The proposed method leverages Gaussian Process Regression (GPR) to incorporate point values into 2D profiles and employs coordinate mapping to integrate magnetic flux information for 2D inversion. The average relative error of the reconstructed profile, using the integrated Bayesian tomography model with normalized magnetic flux, is as low as 3.60*10^(-4). Additionally, sensitivity tests were conducted on the number of grids, the standard deviation of synthetic diagnostic data, and noise levels, laying a solid foundation for the application of the model to experimental data. 
3. arXiv:2502.08820 [pdf, other] cs.AI cs.CL
Title: Can a Single Model Master Both Multi-turn Conversations and Tool Use? CALM: A Unified Conversational Agentic Language Model
Authors: Emre Can Acikgoz, Jeremiah Greer, Akul Datta, Ze Yang, William Zeng, Oussama Elachqar, Emmanouil Koukoumidis, Dilek Hakkani-Tür, Gokhan Tur
Abstract: Large Language Models (LLMs) with API-calling capabilities enabled building effective Language Agents (LA), while also revolutionizing the conventional task-oriented dialogue (TOD) paradigm. However, current approaches face a critical dilemma: TOD systems are often trained on a limited set of target APIs, requiring new data to maintain their quality when interfacing with new services, while LAs are not trained to maintain user intent over multi-turn conversations. Because both robust multi-turn management and advanced function calling are crucial for effective conversational agents, we evaluate these skills on three popular benchmarks, MultiWOZ 2.4 (TOD), BFCL V3 (LA), and API-Bank (LA), and our analyses reveal that specialized approaches excel in one domain but underperform in the other. To bridge this chasm, we introduce CALM (Conversational Agentic Language Model), a unified approach that integrates both conversational and agentic capabilities. We created CALM-IT, a carefully constructed multi-task dataset that interleaves multi-turn ReAct reasoning with complex API usage. Using CALM-IT, we train three models, CALM 8B, CALM 70B, and CALM 405B, which outperform top domain-specific models, including GPT-4o, across all three benchmarks.
Submitted 12 February, 2025; originally announced February 2025.
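CALM-IT interleaves multi-turn ReAct reasoning with API calls. A generic sketch of that interaction pattern (not CALM's actual data format; the tool registry and message schema are invented for illustration):

```python
import json

# Hypothetical tool registry; names and signatures are illustrative only.
TOOLS = {"find_hotel": lambda city: {"name": "Hotel Alpha", "city": city}}

def react_turn(llm, history):
    # One ReAct-style exchange: the model emits either a tool action or a
    # final answer; tool observations are appended back into the history.
    while True:
        step = llm(history)      # expected: {"action", "args"} or {"answer"}
        if "answer" in step:
            return step["answer"]
        obs = TOOLS[step["action"]](**step["args"])
        history.append({"role": "tool", "content": json.dumps(obs)})
```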
4. arXiv:2502.08373 [pdf, other] cs.CV cs.AI
Title: Uncertainty Aware Human-machine Collaboration in Camouflaged Object Detection
Authors: Ziyue Yang, Kehan Wang, Yuhang Ming, Yong Peng, Han Yang, Qiong Chen, Wanzeng Kong
Abstract: Camouflaged Object Detection (COD), the task of identifying objects concealed within their environments, has seen rapid growth due to its wide range of practical applications. A key step toward developing trustworthy COD systems is the estimation and effective utilization of uncertainty. In this work, we propose a human-machine collaboration framework for classifying the presence of camouflaged objects, leveraging the complementary strengths of computer vision (CV) models and noninvasive brain-computer interfaces (BCIs). Our approach introduces a multiview backbone to estimate uncertainty in CV model predictions, utilizes this uncertainty during training to improve efficiency, and defers low-confidence cases to human evaluation via RSVP-based BCIs during testing for more reliable decision-making. We evaluated the framework on the CAMO dataset, achieving state-of-the-art results with an average improvement of 4.56% in balanced accuracy (BA) and 3.66% in the F1 score compared to existing methods. For the best-performing participants, the improvements reached 7.6% in BA and 6.66% in the F1 score. Analysis of the training process revealed a strong correlation between our confidence measures and precision, while an ablation study confirmed the effectiveness of the proposed training policy and the human-machine collaboration strategy. In general, this work reduces human cognitive load, improves system reliability, and provides a strong foundation for advancements in real-world COD applications and human-computer interaction. Our code and data are available at: https://github.com/ziyuey/Uncertainty-aware-human-machine-collaboration-in-camouflaged-object-identification.
Submitted 12 February, 2025; originally announced February 2025.
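The deferral mechanism is the concrete part: aggregate multiview predictions, keep confident cases automated, and route the rest to human (RSVP/BCI) review. A minimal sketch with an invented confidence rule (the paper's uncertainty measure may differ):

```python
import numpy as np

def defer_low_confidence(view_probs, threshold=0.8):
    # view_probs: (n_views, n_samples) per-view probabilities that a
    # camouflaged object is present in each image.
    mean_p = view_probs.mean(axis=0)
    conf = np.maximum(mean_p, 1.0 - mean_p)    # confidence in the majority label
    defer_idx = np.where(conf < threshold)[0]  # low-confidence cases go to humans
    return mean_p > 0.5, defer_idx

preds, to_human = defer_low_confidence(np.random.rand(5, 100))
```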
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08253">arXiv:2502.08253</a> <span> [<a href="https://arxiv.org/pdf/2502.08253">pdf</a>, <a href="https://arxiv.org/format/2502.08253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Oriented GPLVM: Expressiveness and Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zi Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhidi Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M+M">Michael Minyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Olmos%2C+P+M">Pablo M. Olmos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08253v1-abstract-short" style="display: inline;"> The multi-view Gaussian process latent variable model (MV-GPLVM) aims to learn a unified representation from multi-view data but is hindered by challenges such as limited kernel expressiveness and low computational efficiency. To overcome these issues, we first introduce a new duality between the spectral density and the kernel function. By modeling the spectral density with a bivariate Gaussian m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08253v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08253v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08253v1-abstract-full" style="display: none;"> The multi-view Gaussian process latent variable model (MV-GPLVM) aims to learn a unified representation from multi-view data but is hindered by challenges such as limited kernel expressiveness and low computational efficiency. To overcome these issues, we first introduce a new duality between the spectral density and the kernel function. By modeling the spectral density with a bivariate Gaussian mixture, we then derive a generic and expressive kernel termed Next-Gen Spectral Mixture (NG-SM) for MV-GPLVMs. To address the inherent computational inefficiency of the NG-SM kernel, we propose a random Fourier feature approximation. Combined with a tailored reparameterization trick, this approximation enables scalable variational inference for both the model and the unified latent representations. Numerical evaluations across a diverse range of multi-view datasets demonstrate that our proposed method consistently outperforms state-of-the-art models in learning meaningful latent representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08253v1-abstract-full').style.display = 'none'; document.getElementById('2502.08253v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08182">arXiv:2502.08182</a> <span> [<a href="https://arxiv.org/pdf/2502.08182">pdf</a>, <a href="https://arxiv.org/format/2502.08182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Memory Offloading for Large Language Model Inference with Latency SLO Guarantees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenxiang Ma</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zhisheng Ye</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zehua Yang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+T">Tianhao Fu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaxun Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yingwei Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenlin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Diyu Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08182v1-abstract-short" style="display: inline;"> Offloading large language models (LLMs) state to host memory during inference promises to reduce operational costs by supporting larger models, longer inputs, and larger batch sizes. However, the design of existing memory offloading mechanisms does not take latency service-level objectives (SLOs) into consideration. As a result, they either lead to frequent SLO violations or underutilize host memo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08182v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08182v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08182v1-abstract-full" style="display: none;"> Offloading large language models (LLMs) state to host memory during inference promises to reduce operational costs by supporting larger models, longer inputs, and larger batch sizes. However, the design of existing memory offloading mechanisms does not take latency service-level objectives (SLOs) into consideration. As a result, they either lead to frequent SLO violations or underutilize host memory, thereby incurring economic loss and thus defeating the purpose of memory offloading. This paper presents Select-N, a latency-SLO-aware memory offloading system for LLM serving. A key challenge in designing Select-N is to reconcile the tension between meeting SLOs and maximizing host memory usage. Select-N overcomes it by exploiting a unique characteristic of modern LLMs: during serving, the computation time of each decoder layer is deterministic. 
7. arXiv:2502.07937 [pdf, other] cs.LG stat.ML
Title: Active Advantage-Aligned Online Reinforcement Learning with Offline Data
Authors: Xuefeng Liu, Hung T. C. Le, Siyu Chen, Rick Stevens, Zhuoran Yang, Matthew R. Walter, Yuxin Chen
Abstract: Online reinforcement learning (RL) enhances policies through direct interactions with the environment but faces challenges related to sample efficiency. In contrast, offline RL leverages extensive pre-collected data to learn policies but often produces suboptimal results due to limited data coverage. Recent efforts have sought to integrate offline and online RL in order to harness the advantages of both approaches. However, effectively combining online and offline RL remains challenging due to issues that include catastrophic forgetting and a lack of robustness and sample efficiency. To address these challenges, we introduce A3RL, a novel method that actively selects data from combined online and offline sources to optimize policy improvement. We provide a theoretical guarantee that validates the effectiveness of our active sampling strategy and conduct thorough empirical experiments showing that our method outperforms existing state-of-the-art online RL techniques that utilize offline data. Our code will be publicly available at: https://github.com/xuefeng-cs/A3RL.
Submitted 11 February, 2025; originally announced February 2025.
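The core idea, actively prioritizing transitions from the merged offline/online pool by their estimated advantage, can be sketched with an invented softmax weighting (A3RL's actual criterion and its theory are in the paper):

```python
import numpy as np

def advantage_aligned_sample(adv, online_mask, n, beta=1.0, bonus=0.5):
    # adv: estimated advantage per transition; online_mask: 1.0 for fresh
    # online data. Higher-advantage (and newer) transitions are drawn more.
    logits = beta * adv + bonus * online_mask
    p = np.exp(logits - logits.max())
    p /= p.sum()
    return np.random.choice(len(adv), size=n, replace=False, p=p)

idx = advantage_aligned_sample(np.random.randn(1000),
                               (np.random.rand(1000) < 0.3).astype(float), 256)
```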
8. arXiv:2502.07870 [pdf, other] cs.CV
Title: TextAtlas5M: A Large-scale Dataset for Dense Text Image Generation
Authors: Alex Jinpeng Wang, Dongxing Mao, Jiawei Zhang, Weiming Han, Zhuobai Dong, Linjie Li, Yiqi Lin, Zhengyuan Yang, Libo Qin, Fuwei Zhang, Lijuan Wang, Min Li
Abstract: Text-conditioned image generation has gained significant attention in recent years and now processes increasingly long and comprehensive text prompts. In everyday life, dense and intricate text appears in contexts like advertisements, infographics, and signage, where the integration of both text and visuals is essential for conveying complex information. However, despite these advances, the generation of images containing long-form text remains a persistent challenge, largely due to the limitations of existing datasets, which often focus on shorter and simpler text. To address this gap, we introduce TextAtlas5M, a novel dataset specifically designed to evaluate long-text rendering in text-conditioned image generation. Our dataset consists of 5 million long-text generated and collected images across diverse data types, enabling comprehensive evaluation of large-scale generative models on long-text image generation. We further curate TextAtlasEval, a human-improved test set of 3,000 examples across 3 data domains, establishing one of the most extensive benchmarks for text-conditioned generation. Evaluations suggest that the TextAtlasEval benchmarks present significant challenges even for the most advanced proprietary models (e.g., GPT-4o with DALL-E 3), while their open-source counterparts show an even larger performance gap. This evidence positions TextAtlas5M as a valuable dataset for training and evaluating future-generation text-conditioned image generation models.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 27 pages, 15 figures. Dataset Website: https://textatlas5m.github.io
9. arXiv:2502.07527 [pdf, other] cs.AI cs.LG
Title: NatureLM: Deciphering the Language of Nature for Scientific Discovery
Authors: Yingce Xia, Peiran Jin, Shufang Xie, Liang He, Chuan Cao, Renqian Luo, Guoqing Liu, Yue Wang, Zequn Liu, Yuan-Jyue Chen, Zekun Guo, Yeqi Bai, Pan Deng, Yaosen Min, Ziheng Lu, Hongxia Hao, Han Yang, Jielan Li, Chang Liu, Jia Zhang, Jianwei Zhu, Kehan Wu, Wei Zhang, Kaiyuan Gao, Qizhi Pei, et al. (20 additional authors not shown)
Abstract: Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA. However, these models are typically trained in isolation, lacking the ability to integrate across different scientific domains. Recognizing that entities within these domains can all be represented as sequences, which together form the "language of nature", we introduce Nature Language Model (briefly, NatureLM), a sequence-based science foundation model designed for scientific discovery. Pre-trained with data from multiple scientific domains, NatureLM offers a unified, versatile model that enables various applications, including: (i) generating and optimizing small molecules, proteins, RNA, and materials using text instructions; (ii) cross-domain generation/design, such as protein-to-molecule and protein-to-RNA generation; and (iii) achieving state-of-the-art performance in tasks like SMILES-to-IUPAC translation and retrosynthesis on USPTO-50k. NatureLM offers a promising generalist approach for various scientific tasks, including drug discovery (hit generation/optimization, ADMET optimization, synthesis), novel material design, and the development of therapeutic proteins or nucleotides. We have developed NatureLM models in different sizes (1 billion, 8 billion, and 46.7 billion parameters) and observed a clear improvement in performance as the model size increases.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 81 pages
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">81 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07350">arXiv:2502.07350</a> <span> [<a href="https://arxiv.org/pdf/2502.07350">pdf</a>, <a href="https://arxiv.org/format/2502.07350">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> KABB: Knowledge-Aware Bayesian Bandits for Dynamic Expert Coordination in Multi-Agent Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jusheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zimeng Huang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yijia Fan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ningyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingyan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuojie Yang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jiawei Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Keze Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07350v1-abstract-short" style="display: inline;"> As scaling large language models faces prohibitive costs, multi-agent systems emerge as a promising alternative, though challenged by static knowledge assumptions and coordination inefficiencies. We introduces Knowledge-Aware Bayesian Bandits (KABB), a novel framework that enhances multi-agent system coordination through semantic understanding and dynamic adaptation. The framework features three k… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07350v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07350v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07350v1-abstract-full" style="display: none;"> As scaling large language models faces prohibitive costs, multi-agent systems emerge as a promising alternative, though challenged by static knowledge assumptions and coordination inefficiencies. We introduces Knowledge-Aware Bayesian Bandits (KABB), a novel framework that enhances multi-agent system coordination through semantic understanding and dynamic adaptation. The framework features three key innovations: a three-dimensional knowledge distance model for deep semantic understanding, a dual-adaptation mechanism for continuous expert optimization, and a knowledge-aware Thompson Sampling strategy for efficient expert selection. Extensive evaluation demonstrates KABB achieves an optimal cost-performance balance, maintaining high performance while keeping computational demands relatively low in multi-agent coordination. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07350v1-abstract-full').style.display = 'none'; document.getElementById('2502.07350v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07295">arXiv:2502.07295</a> <span> [<a href="https://arxiv.org/pdf/2502.07295">pdf</a>, <a href="https://arxiv.org/format/2502.07295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Treatment Effect Estimation for Exponential Family Outcomes using Neural Networks with Targeted Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zeqin Yang</a>, <a href="/search/cs?searchtype=author&query=Dan%2C+J">Jiayi Dan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jixing Xu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Z">Zhichao Zou</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+P">Peng Zhen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiecheng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07295v1-abstract-short" style="display: inline;"> Neural Networks (NNs) have became a natural choice for treatment effect estimation due to their strong approximation capabilities. Nevertheless, how to design NN-based estimators with desirable properties, such as low bias and doubly robustness, still remains a significant challenge. A common approach to address this is targeted regularization, which modifies the objective function of NNs. However… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07295v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07295v1-abstract-full" style="display: none;"> Neural Networks (NNs) have became a natural choice for treatment effect estimation due to their strong approximation capabilities. Nevertheless, how to design NN-based estimators with desirable properties, such as low bias and doubly robustness, still remains a significant challenge. A common approach to address this is targeted regularization, which modifies the objective function of NNs. However, existing works on targeted regularization are limited to Gaussian-distributed outcomes, significantly restricting their applicability in real-world scenarios. In this work, we aim to bridge this blank by extending this framework to the boarder exponential family outcomes. Specifically, we first derive the von-Mises expansion of the Average Dose function of Canonical Functions (ADCF), which inspires us how to construct a doubly robust estimator with good properties. 
Based on this, we develop an NN-based estimator for ADCF by generalizing functional targeted regularization to exponential families, and provide the corresponding theoretical convergence rate. Extensive experimental results demonstrate the effectiveness of our proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07295v1-abstract-full').style.display = 'none'; document.getElementById('2502.07295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07237">arXiv:2502.07237</a> <span> [<a href="https://arxiv.org/pdf/2502.07237">pdf</a>, <a href="https://arxiv.org/format/2502.07237">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> DrugImproverGPT: A Large Language Model for Drug Optimization with Fine-Tuning via Structured Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuefeng Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Songhao Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siyu Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuoran Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Foster%2C+I">Ian Foster</a>, <a href="/search/cs?searchtype=author&query=Stevens%2C+R">Rick Stevens</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07237v1-abstract-short" style="display: inline;"> Finetuning a Large Language Model (LLM) is crucial for generating results towards specific objectives. This research delves into the realm of drug optimization and introduces a novel reinforcement learning algorithm to finetune a drug optimization LLM-based generative model, enhancing the original drug across target objectives, while retaining the beneficial chemical properties of the original drug.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07237v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07237v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07237v1-abstract-full" style="display: none;"> Finetuning a Large Language Model (LLM) is crucial for generating results towards specific objectives.
This research delves into the realm of drug optimization and introduces a novel reinforcement learning algorithm to finetune a drug optimization LLM-based generative model, enhancing the original drug across target objectives, while retaining the beneficial chemical properties of the original drug. This work comprises two primary components: (1) DrugImprover: A framework tailored for improving robustness and efficiency in drug optimization. It includes an LLM designed for drug optimization and a novel Structured Policy Optimization (SPO) algorithm, which is theoretically grounded. This algorithm offers a unique perspective for fine-tuning the LLM-based generative model by aligning the improvement of the generated molecule with the input molecule under desired objectives. (2) A dataset of 1 million compounds, each with OEDOCK docking scores on 5 human proteins associated with cancer cells and 24 binding sites from the SARS-CoV-2 virus. We conduct a comprehensive evaluation of SPO and demonstrate its effectiveness in improving the original drug across target properties. Our code and dataset will be publicly available at: https://github.com/xuefeng-cs/DrugImproverGPT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07237v1-abstract-full').style.display = 'none'; document.getElementById('2502.07237v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06756">arXiv:2502.06756</a> <span> [<a href="https://arxiv.org/pdf/2502.06756">pdf</a>, <a href="https://arxiv.org/format/2502.06756">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAMRefiner: Taming Segment Anything Model for Universal Mask Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuqi Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hengjia Li</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wenqi Shao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zheng Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jun Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaipeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06756v1-abstract-short" style="display: inline;"> In this paper, we explore a principled way to enhance the quality of widely pre-existing coarse masks, enabling them to serve as reliable training data for segmentation models to reduce the annotation cost.
In contrast to prior refinement techniques that are tailored to specific models or tasks in a closed-world manner, we propose SAMRefiner, a universal and efficient approach that adapts SAM to the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06756v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06756v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06756v1-abstract-full" style="display: none;"> In this paper, we explore a principled way to enhance the quality of widely pre-existing coarse masks, enabling them to serve as reliable training data for segmentation models to reduce the annotation cost. In contrast to prior refinement techniques that are tailored to specific models or tasks in a closed-world manner, we propose SAMRefiner, a universal and efficient approach that adapts SAM to the mask refinement task. The core technique of our model is the noise-tolerant prompting scheme. Specifically, we introduce a multi-prompt excavation strategy to mine diverse input prompts for SAM (i.e., distance-guided points, context-aware elastic bounding boxes, and Gaussian-style masks) from initial coarse masks. These prompts can collaborate with each other to mitigate the effect of defects in coarse masks. In particular, considering SAM's difficulty in handling the multi-object case in semantic segmentation, we introduce a split-then-merge (STM) pipeline. Additionally, we extend our method to SAMRefiner++ by introducing an additional IoU adaptation step to further boost the performance of the generic SAMRefiner on the target dataset. This step is self-boosted and requires no additional annotation. The proposed framework is versatile and can flexibly cooperate with existing segmentation methods. We evaluate our mask refinement framework on a wide range of benchmarks under different settings, demonstrating better accuracy and efficiency. SAMRefiner holds significant potential to expedite the evolution of refinement tools. Our code is available at https://github.com/linyq2117/SAMRefiner. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06756v1-abstract-full').style.display = 'none'; document.getElementById('2502.06756v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
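<p class="is-size-7">As a rough illustration of mining prompts from a coarse mask, the sketch below derives a distance-guided point (the interior maximum of the distance transform) and a loosely expanded bounding box. The expansion factor and the exact heuristics are assumptions, and are simpler than SAMRefiner's elastic boxes and Gaussian-style masks.</p>
<pre><code>
import numpy as np
from scipy import ndimage

def prompts_from_coarse_mask(mask, expand=0.1):
    """mask: 2-D boolean array (assumed non-empty). Returns a point prompt
    (x, y) deep inside the mask and a loosely expanded box (x0, y0, x1, y1)."""
    dist = ndimage.distance_transform_edt(mask)
    y, x = np.unravel_index(np.argmax(dist), dist.shape)
    ys, xs = np.nonzero(mask)
    h, w = ys.max() - ys.min(), xs.max() - xs.min()
    box = (max(xs.min() - expand * w, 0),
           max(ys.min() - expand * h, 0),
           min(xs.max() + expand * w, mask.shape[1] - 1),
           min(ys.max() + expand * h, mask.shape[0] - 1))
    return (int(x), int(y)), box
</code></pre>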
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06615">arXiv:2502.06615</a> <span> [<a href="https://arxiv.org/pdf/2502.06615">pdf</a>, <a href="https://arxiv.org/format/2502.06615">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Scale Feature Fusion with Image-Driven Spatial Integration for Left Atrium Segmentation from Cardiac MRI Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kundu%2C+B">Bipasha Kundu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zixin Yang</a>, <a href="/search/cs?searchtype=author&query=Simon%2C+R">Richard Simon</a>, <a href="/search/cs?searchtype=author&query=Linte%2C+C">Cristian Linte</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06615v1-abstract-short" style="display: inline;"> Accurate segmentation of the left atrium (LA) from late gadolinium-enhanced magnetic resonance imaging plays a vital role in visualizing diseased atrial structures, enabling the diagnosis and management of cardiovascular diseases. It is particularly essential for planning treatment with ablation therapy, a key intervention for atrial fibrillation (AF). However, manual segmentation is time-intensiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06615v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06615v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06615v1-abstract-full" style="display: none;"> Accurate segmentation of the left atrium (LA) from late gadolinium-enhanced magnetic resonance imaging plays a vital role in visualizing diseased atrial structures, enabling the diagnosis and management of cardiovascular diseases. It is particularly essential for planning treatment with ablation therapy, a key intervention for atrial fibrillation (AF). However, manual segmentation is time-intensive and prone to inter-observer variability, underscoring the need for automated solutions. Class-agnostic foundation models like DINOv2 have demonstrated remarkable feature extraction capabilities in vision tasks. However, their lack of domain specificity and task-specific adaptation can reduce spatial resolution during feature extraction, impacting the capture of fine anatomical detail in medical imaging. To address this limitation, we propose a segmentation framework that integrates DINOv2 as an encoder with a UNet-style decoder, incorporating multi-scale feature fusion and input image integration to enhance segmentation accuracy. The learnable weighting mechanism dynamically prioritizes hierarchical features from different encoder blocks of the foundation model, optimizing feature selection for task relevance. 
Additionally, the input image is reintroduced during the decoding stage to preserve high-resolution spatial details, addressing limitations of downsampling in the encoder. We validate our approach on the LAScarQS 2022 dataset and demonstrate improved performance, with a 92.3% Dice score and an 84.1% IoU score for the giant architecture, compared to the nnUNet baseline model. These findings emphasize the efficacy of our approach in advancing the field of automated left atrium segmentation from cardiac MRI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06615v1-abstract-full').style.display = 'none'; document.getElementById('2502.06615v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06527">arXiv:2502.06527</a> <span> [<a href="https://arxiv.org/pdf/2502.06527">pdf</a>, <a href="https://arxiv.org/format/2502.06527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CustomVideoX: 3D Reference Attention Driven Dynamic Adaptation for Zero-Shot Customized Video Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=She%2C+D">D. She</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mushui Liu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+J">Jingxuan Pang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jin Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wanggui He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guanghao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qihan Huang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haobin Tang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yunlong Yu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+S">Siming Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06527v1-abstract-short" style="display: inline;"> Customized generation has achieved significant progress in image synthesis, yet personalized video generation remains challenging due to temporal inconsistencies and quality degradation. In this paper, we introduce CustomVideoX, an innovative framework leveraging the video diffusion transformer for personalized video generation from a reference image.
CustomVideoX capitalizes on pre-trained video… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06527v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06527v1-abstract-full" style="display: none;"> Customized generation has achieved significant progress in image synthesis, yet personalized video generation remains challenging due to temporal inconsistencies and quality degradation. In this paper, we introduce CustomVideoX, an innovative framework leveraging the video diffusion transformer for personalized video generation from a reference image. CustomVideoX capitalizes on pre-trained video networks by exclusively training the LoRA parameters to extract reference features, ensuring both efficiency and adaptability. To facilitate seamless interaction between the reference image and video content, we propose 3D Reference Attention, which enables direct and simultaneous engagement of reference image features with all video frames across spatial and temporal dimensions. To mitigate the excessive influence of reference image features and textual guidance on generated video content during inference, we implement the Time-Aware Reference Attention Bias (TAB) strategy, dynamically modulating reference bias over different time steps. Additionally, we introduce the Entity Region-Aware Enhancement (ERAE) module, aligning highly activated regions of key entity tokens with reference feature injection by adjusting attention bias. To thoroughly evaluate personalized video generation, we establish a new benchmark, VideoBench, comprising over 50 objects and 100 prompts for extensive assessment. Experimental results show that CustomVideoX significantly outperforms existing methods in terms of video consistency and quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06527v1-abstract-full').style.display = 'none'; document.getElementById('2502.06527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
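<p class="is-size-7">The time-aware bias idea above can be sketched as a cross-attention whose logits receive an additive reference bias that decays over denoising steps. The linear decay schedule and tensor shapes below are assumptions for illustration; CustomVideoX's actual modulation is not reproduced here.</p>
<pre><code>
import torch

def reference_attention(q, k_ref, v_ref, step, total_steps, max_bias=2.0):
    """q: (B, T_video, D); k_ref, v_ref: (B, T_ref, D).
    Adds a time-dependent bias to the video-to-reference attention logits."""
    bias = max_bias * (1.0 - step / total_steps)  # assumed linear decay
    logits = q @ k_ref.transpose(-2, -1) / q.shape[-1] ** 0.5 + bias
    return torch.softmax(logits, dim=-1) @ v_ref
</code></pre>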
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06474">arXiv:2502.06474</a> <span> [<a href="https://arxiv.org/pdf/2502.06474">pdf</a>, <a href="https://arxiv.org/format/2502.06474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniMoD: Efficient Unified Multimodal Transformers with Mixture-of-Depths </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+W">Weijia Mao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenheng Yang</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06474v1-abstract-short" style="display: inline;"> Unified multimodal transformers, which handle both generation and understanding tasks within a shared parameter space, have received increasing attention in recent research. Although various unified transformers have been proposed, training these models is costly due to redundant tokens and heavy attention computation. In the past, studies on large language models have demonstrated that token prun… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06474v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06474v1-abstract-full" style="display: none;"> Unified multimodal transformers, which handle both generation and understanding tasks within a shared parameter space, have received increasing attention in recent research. Although various unified transformers have been proposed, training these models is costly due to redundant tokens and heavy attention computation. In the past, studies on large language models have demonstrated that token pruning methods, such as Mixture of Depths (MoD), can significantly improve computational efficiency. MoD employs a router to select the most important ones for processing within a transformer layer. However, directly applying MoD-based token pruning to unified transformers will result in suboptimal performance because different tasks exhibit varying levels of token redundancy. In our work, we analyze the unified transformers by (1) examining attention weight patterns, (2) evaluating the layer importance and token redundancy, and (3) analyzing task interactions. Our findings reveal that token redundancy is primarily influenced by different tasks and layers. Building on these findings, we introduce UniMoD, a task-aware token pruning method that employs a separate router for each task to determine which tokens should be pruned. We apply our method to Show-o and Emu3, reducing training FLOPs by approximately 15% in Show-o and 40% in Emu3, while maintaining or improving performance on several benchmarks. Code will be released at https://github.com/showlab/UniMoD. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06474v1-abstract-full').style.display = 'none'; document.getElementById('2502.06474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06337">arXiv:2502.06337</a> <span> [<a href="https://arxiv.org/pdf/2502.06337">pdf</a>, <a href="https://arxiv.org/format/2502.06337">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Outlier-robust Rotation Estimation by Stereographic Projection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+T">Taosi Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yinlong Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xianbo Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhi-Xin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06337v1-abstract-short" style="display: inline;"> Rotation estimation plays a fundamental role in many computer vision and robot tasks. However, efficiently estimating rotation in large inputs containing numerous outliers (i.e., mismatches) and noise is a recognized challenge. Many robust rotation estimation methods have been designed to address this challenge. Unfortunately, existing methods are often inapplicable due to their long computation t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06337v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06337v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06337v1-abstract-full" style="display: none;"> Rotation estimation plays a fundamental role in many computer vision and robot tasks. However, efficiently estimating rotation in large inputs containing numerous outliers (i.e., mismatches) and noise is a recognized challenge. Many robust rotation estimation methods have been designed to address this challenge. Unfortunately, existing methods are often inapplicable due to their long computation time and the risk of local optima. In this paper, we propose an efficient and robust rotation estimation method. Specifically, our method first investigates geometric constraints involving only the rotation axis. Then, it uses stereographic projection and spatial voting techniques to identify the rotation axis and angle. Furthermore, our method efficiently obtains the optimal rotation estimation and can estimate multiple rotations simultaneously. To verify the feasibility of our method, we conduct comparative experiments using both synthetic and real-world data. 
The results show that, with GPU assistance, our method can solve large-scale ($10^6$ points) and severely corrupted (90\% outlier rate) rotation estimation problems within 0.07 seconds, with an angular error of only 0.01 degrees, which is superior to existing methods in terms of accuracy and efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06337v1-abstract-full').style.display = 'none'; document.getElementById('2502.06337v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06324">arXiv:2502.06324</a> <span> [<a href="https://arxiv.org/pdf/2502.06324">pdf</a>, <a href="https://arxiv.org/format/2502.06324">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UniDemoiré: Towards Universal Image Demoiréing with Data Generation and Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zemin Yang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yujing Sun</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xidong Peng</a>, <a href="/search/cs?searchtype=author&query=Yiu%2C+S+M">Siu Ming Yiu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yuexin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06324v1-abstract-short" style="display: inline;"> Image demoiréing poses one of the most formidable challenges in image restoration, primarily due to the unpredictable and anisotropic nature of moiré patterns. Limited by the quantity and diversity of training data, current methods tend to overfit to a single moiré domain, resulting in performance degradation for new domains and restricting their robustness in real-world applications. In this pape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06324v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06324v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06324v1-abstract-full" style="display: none;"> Image demoiréing poses one of the most formidable challenges in image restoration, primarily due to the unpredictable and anisotropic nature of moiré patterns. Limited by the quantity and diversity of training data, current methods tend to overfit to a single moiré domain, resulting in performance degradation for new domains and restricting their robustness in real-world applications. In this paper, we propose a universal image demoiréing solution, UniDemoiré, which has superior generalization capability. Notably, we propose innovative and effective data generation and synthesis methods that can automatically provide vast high-quality moiré images to train a universal demoiréing model.
Our extensive experiments demonstrate the cutting-edge performance and broad potential of our approach for generalized image demoiréing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06324v1-abstract-full').style.display = 'none'; document.getElementById('2502.06324v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05874">arXiv:2502.05874</a> <span> [<a href="https://arxiv.org/pdf/2502.05874">pdf</a>, <a href="https://arxiv.org/format/2502.05874">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor Scene Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhifei Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Keyang Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+J">Jiaxing Qi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+R">Ruifei Ma</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shenglin Yin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yifan Xu</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+M">Mingzhe Xing</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhen Xiao</a>, <a href="/search/cs?searchtype=author&query=Long%2C+J">Jieyi Long</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiangde Liu</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+G">Guangyao Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05874v1-abstract-short" style="display: inline;"> Controllable 3D scene generation has extensive applications in virtual reality and interior design, where the generated scenes should exhibit high levels of realism and controllability in terms of geometry. Scene graphs provide a suitable data representation that facilitates these applications.
However, current graph-based methods for scene generation are constrained to text-based inputs and exhib… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05874v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05874v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05874v1-abstract-full" style="display: none;"> Controllable 3D scene generation has extensive applications in virtual reality and interior design, where the generated scenes should exhibit high levels of realism and controllability in terms of geometry. Scene graphs provide a suitable data representation that facilitates these applications. However, current graph-based methods for scene generation are constrained to text-based inputs and exhibit insufficient adaptability to flexible user inputs, hindering the ability to precisely control object geometry. To address this issue, we propose MMGDreamer, a dual-branch diffusion model for scene generation that incorporates a novel Mixed-Modality Graph, visual enhancement module, and relation predictor. The mixed-modality graph allows object nodes to integrate textual and visual modalities, with optional relationships between nodes. It enhances adaptability to flexible user inputs and enables meticulous control over the geometry of objects in the generated scenes. The visual enhancement module enriches the visual fidelity of text-only nodes by constructing visual representations using text embeddings. Furthermore, our relation predictor leverages node representations to infer absent relationships between nodes, resulting in more coherent scene layouts. Extensive experimental results demonstrate that MMGDreamer exhibits superior control of object geometry, achieving state-of-the-art scene generation performance. Project page: https://yangzhifeio.github.io/project/MMGDreamer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05874v1-abstract-full').style.display = 'none'; document.getElementById('2502.05874v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
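<p class="is-size-7">The mixed-modality graph itself is easy to picture as a data structure: each node may carry a text embedding, a visual embedding, or both, and relations between nodes are optional so that a relation predictor can fill in the gaps. The sketch below is a minimal assumed representation, not MMGDreamer's implementation.</p>
<pre><code>
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

@dataclass
class MixedModalityNode:
    text_emb: Optional[List[float]] = None    # present for text-described objects
    visual_emb: Optional[List[float]] = None  # present when an image crop is given

@dataclass
class MixedModalityGraph:
    nodes: List[MixedModalityNode] = field(default_factory=list)
    # Relations keyed by (src, dst); absent pairs are left to the predictor.
    edges: Dict[Tuple[int, int], str] = field(default_factory=dict)

    def missing_pairs(self):
        """Node pairs with no relation yet, i.e. the relation predictor's input."""
        n = len(self.nodes)
        return [(i, j) for i in range(n) for j in range(n)
                if i != j and (i, j) not in self.edges]
</code></pre>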
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025 Main Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05740">arXiv:2502.05740</a> <span> [<a href="https://arxiv.org/pdf/2502.05740">pdf</a>, <a href="https://arxiv.org/format/2502.05740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RECOVER: Designing a Large Language Model-based Remote Patient Monitoring System for Postoperative Gastrointestinal Cancer Care </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziqi Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yuxuan Lu</a>, <a href="/search/cs?searchtype=author&query=Bagdasarian%2C+J">Jennifer Bagdasarian</a>, <a href="/search/cs?searchtype=author&query=Swain%2C+V+D">Vedant Das Swain</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+R">Ritu Agarwal</a>, <a href="/search/cs?searchtype=author&query=Campbell%2C+C">Collin Campbell</a>, <a href="/search/cs?searchtype=author&query=Al-Refaire%2C+W">Waddah Al-Refaire</a>, <a href="/search/cs?searchtype=author&query=El-Bayoumi%2C+J">Jehan El-Bayoumi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+G">Guodong Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dakuo Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bingsheng Yao</a>, <a href="/search/cs?searchtype=author&query=Shara%2C+N">Nawar Shara</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05740v1-abstract-short" style="display: inline;"> Cancer surgery is a key treatment for gastrointestinal (GI) cancers, a group of cancers that account for more than 35% of cancer-related deaths worldwide, but postoperative complications are unpredictable and can be life-threatening. In this paper, we investigate how recent advancements in large language models (LLMs) can benefit remote patient monitoring (RPM) systems through clinical integration… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05740v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05740v1-abstract-full" style="display: none;"> Cancer surgery is a key treatment for gastrointestinal (GI) cancers, a group of cancers that account for more than 35% of cancer-related deaths worldwide, but postoperative complications are unpredictable and can be life-threatening. In this paper, we investigate how recent advancements in large language models (LLMs) can benefit remote patient monitoring (RPM) systems through clinical integration by designing RECOVER, an LLM-powered RPM system for postoperative GI cancer care. 
To closely engage stakeholders in the design process, we first conducted seven participatory design sessions with five clinical staff and interviewed five cancer patients to derive six major design strategies for integrating clinical guidelines and information needs into LLM-based RPM systems. We then designed and implemented RECOVER, which features an LLM-powered conversational agent for cancer patients and an interactive dashboard for clinical staff to enable efficient postoperative RPM. Finally, we used RECOVER as a pilot system to assess the implementation of our design strategies with four clinical staff and five patients, providing design implications by identifying crucial design elements, offering insights on responsible AI, and outlining opportunities for future LLM-powered RPM systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05740v1-abstract-full').style.display = 'none'; document.getElementById('2502.05740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05561">arXiv:2502.05561</a> <span> [<a href="https://arxiv.org/pdf/2502.05561">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Model for Interest Refinement in Multi-Interest Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+Y">Yankun Le</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+B">Baoyuan Ou</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yingjie Qin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhixuan Yang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+R">Ruilong Su</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05561v2-abstract-short" style="display: inline;"> Multi-interest candidate matching plays a pivotal role in personalized recommender systems, as it captures diverse user interests from their historical behaviors. Most existing methods utilize attention mechanisms to generate interest representations by aggregating historical item embeddings. However, these methods only capture overall item-level relevance, leading to coarse-grained interest repre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05561v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05561v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05561v2-abstract-full" style="display: none;"> Multi-interest candidate matching plays a pivotal role in personalized recommender systems, as it captures diverse user interests from their historical behaviors. 
Most existing methods utilize attention mechanisms to generate interest representations by aggregating historical item embeddings. However, these methods only capture overall item-level relevance, leading to coarse-grained interest representations that include irrelevant information. To address this issue, we propose the Diffusion Multi-Interest model (DMI), a novel framework for refining user interest representations at the dimension level. Specifically, DMI first introduces controllable noise into coarse-grained interest representations at the dimensional level. Then, in the iterative reconstruction process, DMI combines a cross-attention mechanism and an item pruning strategy to reconstruct the personalized interest vectors with the guidance of tailored collaborative information. Extensive experiments demonstrate the effectiveness of DMI, surpassing state-of-the-art methods on offline evaluations and an online A/B test. Successfully deployed in a real-world recommender system, DMI effectively enhances user satisfaction and system performance at scale, serving the major traffic of hundreds of millions of daily active users. (The code will be released for reproducibility once the paper is accepted.) <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05561v2-abstract-full').style.display = 'none'; document.getElementById('2502.05561v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05434">arXiv:2502.05434</a> <span> [<a href="https://arxiv.org/pdf/2502.05434">pdf</a>, <a href="https://arxiv.org/format/2502.05434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sample-Efficient Reinforcement Learning from Human Feedback via Information-Directed Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+H">Han Qi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haochen Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiaosheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuoran Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05434v2-abstract-short" style="display: inline;"> We study the problem of reinforcement learning from human feedback (RLHF), a critical problem in training large language models, from a theoretical perspective. Our main contribution is the design of novel sample-efficient RLHF algorithms based on information-directed sampling (IDS), an online decision-making principle inspired by information theory.
Our algorithms maximize the sum of the value fu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05434v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05434v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05434v2-abstract-full" style="display: none;"> We study the problem of reinforcement learning from human feedback (RLHF), a critical problem in training large language models, from a theoretical perspective. Our main contribution is the design of novel sample-efficient RLHF algorithms based on information-directed sampling (IDS), an online decision-making principle inspired by information theory. Our algorithms maximize the sum of the value function and a mutual information term that encourages exploration of the unknown environment (which quantifies the information gained about the environment through observed human feedback data). To tackle the challenge of large state spaces and improve sample efficiency, we construct a simplified <em>surrogate environment</em> and introduce a novel distance measure (named the <em>$\ell_g$-distance</em>), enabling our IDS-based algorithm to achieve a Bayesian regret upper bound of order $O(H^{\frac{3}{2}}\sqrt{\log(K(ε)) T})$, where $H$ is the episode length, $T$ is the number of episodes, and $K(ε)$ is related to the covering number of the environment. Specializing to the tabular setting, this regret bound is of order $\tilde{O}(H^2\sqrt{SAT})$, where $S$ and $A$ are the numbers of states and actions. Finally, we propose an Approximate-IDS algorithm that is computationally more efficient while maintaining nearly the same sample efficiency. The design principle of this approximate algorithm is not only effective in RLHF settings but also applicable to the standard RL framework. Moreover, our work showcases the value of information theory in reinforcement learning and in the training of large language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05434v2-abstract-full').style.display = 'none'; document.getElementById('2502.05434v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
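<p class="is-size-7">The IDS objective described above (value plus an exploration bonus measuring information gained about the environment) admits a compact toy form: with a posterior over a finite set of environment hypotheses, the mutual information between the hypothesis and the next observation under each action can be computed directly. Everything below (finite hypotheses, discrete feedback, the trade-off weight) is an illustrative assumption, not the paper's algorithm.</p>
<pre><code>
import numpy as np

def ids_select(values, pred, w, lam=1.0):
    """values: (A,) value estimates per action; pred: (A, H, Y) feedback
    probabilities under each hypothesis; w: (H,) posterior over hypotheses.
    Scores each action by value + mutual information I(hypothesis; feedback)."""
    def entropy(p):
        p = np.clip(p, 1e-12, 1.0)
        return -(p * np.log(p)).sum(-1)
    marginal = np.einsum('ahy,h->ay', pred, w)        # E_h[p(y | a, h)]
    info_gain = entropy(marginal) - np.einsum('h,ah->a', w, entropy(pred))
    return int(np.argmax(values + lam * info_gain))
</code></pre>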
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05390">arXiv:2502.05390</a> <span> [<a href="https://arxiv.org/pdf/2502.05390">pdf</a>, <a href="https://arxiv.org/format/2502.05390">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning Task Representations from In-Context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saglam%2C+B">Baturay Saglam</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuoran Yang</a>, <a href="/search/cs?searchtype=author&query=Kalogerias%2C+D">Dionysis Kalogerias</a>, <a href="/search/cs?searchtype=author&query=Karbasi%2C+A">Amin Karbasi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05390v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable proficiency in in-context learning (ICL), where models adapt to new tasks through example-based prompts without requiring parameter updates. However, understanding how tasks are internally encoded and generalized remains a challenge. To address some of the empirical and technical gaps in the literature, we introduce an automated formulation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05390v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05390v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05390v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable proficiency in in-context learning (ICL), where models adapt to new tasks through example-based prompts without requiring parameter updates. However, understanding how tasks are internally encoded and generalized remains a challenge. To address some of the empirical and technical gaps in the literature, we introduce an automated formulation for encoding task information in ICL prompts as a function of attention heads within the transformer architecture. This approach computes a single task vector as a weighted sum of attention heads, with the weights optimized causally via gradient descent. Our findings show that existing methods fail to generalize effectively to modalities beyond text. In response, we also design a benchmark to evaluate whether a task vector can preserve task fidelity in functional regression tasks. The proposed method successfully extracts task-specific information from in-context demonstrations and excels in both text and regression tasks, demonstrating its generalizability across modalities. Moreover, ablation studies show that our method's effectiveness stems from aligning the distribution of the last hidden state with that of an optimally performing in-context-learned model. 
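<p class="is-size-7">The task-vector construction above reduces to a weighted sum over attention-head outputs with learnable weights. A minimal sketch, assuming per-head contributions at the final token position have already been extracted; the softmax normalization of the weights is my assumption.</p>
<pre><code>
import torch

def task_vector(head_outputs, weights):
    """head_outputs: (n_heads, d_model) per-head contributions at the last
    token; weights: (n_heads,) learnable and, per the abstract, optimized
    causally via gradient descent. Returns a single (d_model,) task vector."""
    return torch.einsum('h,hd->d', torch.softmax(weights, dim=0), head_outputs)
</code></pre>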
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05390v1-abstract-full').style.display = 'none'; document.getElementById('2502.05390v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Appeared in ICML 2024 Workshop on In-Context Learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05237">arXiv:2502.05237</a> <span> [<a href="https://arxiv.org/pdf/2502.05237">pdf</a>, <a href="https://arxiv.org/format/2502.05237">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PSM-SQL: Progressive Schema Learning with Multi-granularity Semantics for Text-to-SQL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuopan Yang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuanzhen Xie</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+R">Ruichao Zhong</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yunzhi Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+E">Enjie Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenguo Yang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mochi Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+B">Bo Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05237v1-abstract-short" style="display: inline;"> It is challenging to convert natural language (NL) questions into executable structured query language (SQL) queries for text-to-SQL tasks due to the vast number of database schemas with redundancy, which interferes with semantic learning, and the domain shift between NL and SQL. Existing works for schema linking focus on the table level and perform it once, ignoring the multi-granularity semantic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05237v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05237v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05237v1-abstract-full" style="display: none;"> It is challenging to convert natural language (NL) questions into executable structured query language (SQL) queries for text-to-SQL tasks due to the vast number of database schemas with redundancy, which interferes with semantic learning, and the domain shift between NL and SQL. Existing works for schema linking focus on the table level and perform it once, ignoring the multi-granularity semantics and chainable cyclicity of schemas. 
In this paper, we propose a progressive schema linking with multi-granularity semantics (PSM-SQL) framework to reduce the redundant database schemas for text-to-SQL. Using the multi-granularity schema linking (MSL) module, PSM-SQL learns the schema semantics at the column, table, and database levels. More specifically, a triplet loss is used at the column level to learn embeddings, while fine-tuning LLMs is employed at the database level for schema reasoning. MSL employs a classifier and similarity scores to model schema interactions for schema linking at the table level. In particular, PSM-SQL adopts a chain loop strategy to reduce the task difficulty of schema linking by continuously reducing the number of redundant schemas. Experiments conducted on text-to-SQL datasets show that the proposed PSM-SQL is 1-3 percentage points higher than existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05237v1-abstract-full').style.display = 'none'; document.getElementById('2502.05237v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 3 figures, submission in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05142">arXiv:2502.05142</a> <span> [<a href="https://arxiv.org/pdf/2502.05142">pdf</a>, <a href="https://arxiv.org/format/2502.05142">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Chest X-ray Foundation Model with Global and Local Representations Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zefan Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuanang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Kalra%2C+M+K">Mannudeep K. Kalra</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Pingkun Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05142v1-abstract-short" style="display: inline;"> Chest X-ray (CXR) is the most frequently ordered imaging test, supporting diverse clinical tasks from thoracic disease detection to postoperative monitoring. However, task-specific classification models are limited in scope, require costly labeled data, and lack generalizability to out-of-distribution datasets.
To address these challenges, we introduce CheXFound, a self-supervised vision foundatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05142v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05142v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05142v1-abstract-full" style="display: none;"> Chest X-ray (CXR) is the most frequently ordered imaging test, supporting diverse clinical tasks from thoracic disease detection to postoperative monitoring. However, task-specific classification models are limited in scope, require costly labeled data, and lack generalizability to out-of-distribution datasets. To address these challenges, we introduce CheXFound, a self-supervised vision foundation model that learns robust CXR representations and generalizes effectively across a wide range of downstream tasks. We pretrain CheXFound on a curated CXR-1M dataset, comprising over one million unique CXRs from publicly available sources. We propose a Global and Local Representations Integration (GLoRI) module for downstream adaptations by incorporating disease-specific local features with global image features for enhanced performance in multilabel classification. Our experimental results show that CheXFound outperforms state-of-the-art models in classifying 40 disease findings across different prevalence levels on the CXR-LT 24 dataset and exhibits superior label efficiency on downstream tasks with limited training data. Additionally, CheXFound achieves significant improvements on new tasks with out-of-distribution datasets, including opportunistic cardiovascular disease risk estimation and mortality prediction. These results highlight CheXFound's strong generalization capabilities, enabling diverse adaptations with improved label efficiency. The project source code is publicly available at https://github.com/RPIDIAL/CheXFound. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05142v1-abstract-full').style.display = 'none'; document.getElementById('2502.05142v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
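<p class="is-size-7">One plausible reading of global-and-local integration is attention pooling with one learnable query per disease finding over patch tokens, concatenated with the global token for per-finding logits. The sketch below is an assumed form of such a head, not GLoRI's actual architecture.</p>
<pre><code>
import torch
import torch.nn as nn

class GlobalLocalHead(nn.Module):
    def __init__(self, dim, n_findings):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(n_findings, dim))
        self.classifier = nn.Linear(2 * dim, 1)

    def forward(self, global_tok, patch_toks):
        # global_tok: (B, D); patch_toks: (B, N, D)
        logits = self.queries @ patch_toks.transpose(1, 2) / patch_toks.shape[-1] ** 0.5
        attn = torch.softmax(logits, dim=-1)                         # (B, C, N)
        local = attn @ patch_toks                                    # (B, C, D)
        g = global_tok.unsqueeze(1).expand(-1, local.shape[1], -1)   # (B, C, D)
        return self.classifier(torch.cat([local, g], dim=-1)).squeeze(-1)
</code></pre>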
arXiv:2502.05116 (https://arxiv.org/abs/2502.05116)
Subjects: cs.NI (Networking and Internet Architecture); cs.LG (Machine Learning); eess.SY (Systems and Control)
Title: Optimizing Wireless Resource Management and Synchronization in Digital Twin Networks
Authors: Hanzhi Yu, Yuchen Liu, Zhaohui Yang, Haijian Sun, Mingzhe Chen
Abstract: In this paper, we investigate an accurate synchronization between a physical network and its digital network twin (DNT), which serves as a virtual representation of the physical network. The considered network includes a set of base stations (BSs) that must allocate their limited spectrum resources to serve a set of users while also transmitting their partially observed physical network information to a cloud server to generate the DNT. Since the DNT can predict the physical network status based on its historical status, the BSs may not need to send their physical network information at each time slot, allowing them to conserve spectrum resources to serve the users. However, if the DNT does not receive the physical network information of the BSs over a long period, the DNT's accuracy in representing the physical network may degrade. To this end, each BS must decide when to send the physical network information to the cloud server to update the DNT, while also determining the spectrum resource allocation policy for both DNT synchronization and serving the users. We formulate this resource allocation task as an optimization problem, aiming to maximize the total data rate of all users while minimizing the asynchronization between the physical network and the DNT.
To address this problem, we propose a method based on gated recurrent units (GRUs) and the value decomposition network (VDN). Simulation results show that our GRU- and VDN-based algorithm improves the weighted sum of data rates and the similarity between the status of the DNT and the physical network by up to 28.96%, compared to a baseline combining GRU with independent Q-learning.
Submitted 7 February, 2025; originally announced February 2025.
Comments: 12 pages, 6 figures
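The VDN component named above has a well-known structure: the joint action-value is the sum of per-agent utilities, so each BS can act greedily on its own Q-values while training is driven by a single team reward. A minimal PyTorch sketch with illustrative shapes (the target network and replay machinery are omitted):

import torch
import torch.nn as nn

class VDNMixer(nn.Module):
    def forward(self, agent_qs):
        # agent_qs: (B, n_agents) chosen-action Q-values, one per BS
        return agent_qs.sum(dim=1, keepdim=True)  # (B, 1) joint Q

def vdn_loss(mixer, agent_qs, rewards, next_joint_q, gamma=0.99):
    # TD-style regression of the joint value onto the team reward
    q_tot = mixer(agent_qs)                  # (B, 1)
    target = rewards + gamma * next_joint_q  # (B, 1)
    return nn.functional.mse_loss(q_tot, target.detach())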
arXiv:2502.04656 (https://arxiv.org/abs/2502.04656)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: MHAF-YOLO: Multi-Branch Heterogeneous Auxiliary Fusion YOLO for accurate object detection
Authors: Zhiqiang Yang, Qiu Guan, Zhongwen Yu, Xinli Xu, Haixia Long, Sheng Lian, Haigen Hu, Ying Tang
Abstract: Thanks to its effective multi-scale feature fusion, the Path Aggregation FPN (PAFPN) has become a widely adopted component of YOLO-based detectors. However, PAFPN struggles to integrate high-level semantic cues with low-level spatial details, limiting its performance in real-world applications, especially under significant scale variations. In this paper, we propose MHAF-YOLO, a novel detection framework featuring a versatile neck design called the Multi-Branch Auxiliary FPN (MAFPN), which consists of two key modules: the Superficial Assisted Fusion (SAF) and the Advanced Assisted Fusion (AAF). The SAF bridges the backbone and the neck by fusing shallow features, effectively transferring crucial low-level spatial information with high fidelity. Meanwhile, the AAF integrates multi-scale feature information at deeper neck layers, delivering richer gradient information to the output layer and further enhancing the model's learning capacity. To complement MAFPN, we introduce the Global Heterogeneous Flexible Kernel Selection (GHFKS) mechanism and the Reparameterized Heterogeneous Multi-Scale (RepHMS) module to enhance feature fusion. RepHMS is globally integrated into the network, utilizing GHFKS to select larger convolutional kernels for various feature layers, expanding the vertical receptive field and capturing contextual information across spatial hierarchies. Locally, it optimizes convolution by processing both large and small kernels within the same layer, broadening the lateral receptive field and preserving crucial details for detecting smaller targets. The source code of this work is available at: https://github.com/yang0201/MHAF-YOLO.
Submitted 6 February, 2025; originally announced February 2025.
Comments: arXiv admin note: text overlap with arXiv:2407.04381
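The large-plus-small kernel idea described for RepHMS can be illustrated with a simple parallel-branch block: the wide kernel grows the receptive field while the small kernel preserves fine detail, and their outputs are summed. Kernel sizes and the module name below are illustrative assumptions; the paper's reparameterization is omitted.

import torch.nn as nn

class ParallelKernelConv(nn.Module):
    def __init__(self, channels, large_k=7, small_k=3):
        super().__init__()
        # same-resolution branches: one wide-context, one detail-preserving
        self.large = nn.Conv2d(channels, channels, large_k, padding=large_k // 2)
        self.small = nn.Conv2d(channels, channels, small_k, padding=small_k // 2)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.large(x) + self.small(x))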
arXiv:2502.04229 (https://arxiv.org/abs/2502.04229)
Subjects: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence)
Title: Dark Distillation: Backdooring Distilled Datasets without Accessing Raw Data
Authors: Ziyuan Yang, Ming Yan, Yi Zhang, Joey Tianyi Zhou
Abstract: Dataset distillation (DD) enhances training efficiency and reduces bandwidth by condensing large datasets into smaller synthetic ones. It enables models to achieve performance comparable to those trained on the raw full dataset and has become a widely adopted method for data sharing. However, security concerns in DD remain underexplored. Existing studies typically assume that malicious behavior originates from dataset owners during the initial distillation process, where backdoors are injected into raw datasets. In contrast, this work is the first to address a more realistic and concerning threat: attackers may intercept the dataset distribution process, inject backdoors into the distilled datasets, and redistribute them to users. While distilled datasets were previously considered resistant to backdoor attacks, we demonstrate that they remain vulnerable to such attacks. Furthermore, we show that attackers do not even require access to any raw data to inject the backdoors successfully. Specifically, our approach reconstructs conceptual archetypes for each class from the model trained on the distilled dataset. Backdoors are then injected into these archetypes to update the distilled dataset.
Moreover, we ensure that the updated dataset not only retains the backdoor but also preserves the original optimization trajectory, thus maintaining the knowledge of the raw dataset. To achieve this, a hybrid loss is designed to integrate backdoor information along the benign optimization trajectory, ensuring that previously learned information is not forgotten. Extensive experiments demonstrate that distilled datasets are highly vulnerable to backdoor attacks, with risks pervasive across various raw datasets, distillation methods, and downstream training strategies. Moreover, our attack method is efficient, capable of synthesizing a malicious distilled dataset in under one minute in certain cases.
Submitted 6 February, 2025; originally announced February 2025.
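A hybrid loss of the general shape this abstract describes combines a benign term (preserving behaviour on the distilled data) with a backdoor term (mapping triggered inputs to the attacker's target class). The sketch below is written generically over student logits; in the attack described, gradients would flow back to the synthetic images through the student. The weight lam and all names are illustrative assumptions, not the paper's formulation.

import torch.nn.functional as F

def hybrid_backdoor_loss(logits_benign, y_benign,
                         logits_triggered, y_target, lam=0.1):
    # benign term: keep the original (benign) optimization trajectory
    benign = F.cross_entropy(logits_benign, y_benign)
    # backdoor term: triggered inputs should predict the attacker's class
    backdoor = F.cross_entropy(logits_triggered, y_target)
    return benign + lam * backdoor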
arXiv:2502.04093 (https://arxiv.org/abs/2502.04093)
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing)
Title: PSZ: Enhancing the SZ Scientific Lossy Compressor With Progressive Data Retrieval
Authors: Zhuoxun Yang, Sheng Di, Longtao Zhang, Ruoyu Li, Ximiao Li, Jiajun Huang, Jinyang Liu, Franck Cappello, Kai Zhao
Abstract: Compression is a crucial solution for data reduction in modern scientific applications due to the exponential growth of data from simulations, experiments, and observations. Compression with progressive retrieval capability allows users to access coarse approximations of data quickly and then incrementally refine these approximations to higher fidelity. Existing progressive compression solutions suffer from low reduction ratios or high operation costs, effectively undermining the approach's benefits. In this paper, we propose the first interpolation-based progressive lossy compression solution that achieves both high reduction ratios and low operation costs. The interpolation-based algorithm has been verified as one of the best for scientific data reduction, but no prior effort has made it support progressive retrieval. Our contributions are three-fold: (1) we thoroughly analyze the error characteristics of the interpolation algorithm and propose our solution, IPComp, with multi-level bitplane and predictive coding; (2) we derive optimized strategies toward minimum data retrieval under different fidelity levels indicated by users through error bounds and bitrates; (3) we evaluate the proposed solution using six real-world datasets from four diverse domains. Experimental results demonstrate that our solution achieves up to 487% higher compression ratios and 698% faster speed than other state-of-the-art progressive compressors, reduces the data volume for retrieval by up to 83% compared to baselines under the same error bound, and reduces the error by up to 99% under the same bitrate.
Submitted 7 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.
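The bitplane coding that underpins progressive retrieval is easy to illustrate: integer codes are split into planes from most- to least-significant bit, so a reader can stop after a few planes for a coarse reconstruction and fetch more to refine it. A minimal NumPy sketch (real codecs, including the one described above, add prediction and entropy coding):

import numpy as np

def bitplanes(quantized, num_planes=16):
    # quantized: non-negative integer codes; returns MSB-first planes
    return [((quantized >> b) & 1).astype(np.uint8)
            for b in range(num_planes - 1, -1, -1)]

def reconstruct(planes, num_planes=16):
    # `planes` may be any MSB-first prefix of the full plane list
    value = np.zeros_like(planes[0], dtype=np.int64)
    for i, p in enumerate(planes):
        value |= p.astype(np.int64) << (num_planes - 1 - i)
    return value

Decoding only the first k planes bounds the remaining quantization error by 2^(num_planes - k) code units, which is what lets fidelity improve monotonically with each retrieved plane.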
arXiv:2502.04078 (https://arxiv.org/abs/2502.04078)
Subjects: cs.MM (Multimedia)
Title: CDIO: Cross-Domain Inference Optimization with Resource Preference Prediction for Edge-Cloud Collaboration
Authors: Zheming Yang, Wen Ji, Qi Guo, Dieli Hu, Chang Zhao, Xiaowei Li, Xuanlei Zhao, Yi Zhao, Chaoyu Gong, Yang You
Abstract: Currently, massive video tasks are processed by edge-cloud collaboration. However, the diversity of task requirements and the dynamics of resources pose great challenges to efficient inference, resulting in many wasted resources. In this paper, we present CDIO, a cross-domain inference optimization framework designed for edge-cloud collaboration. For diverse input tasks, CDIO can predict resource preference types by analyzing the spatial complexity and processing requirements of each task. Subsequently, a cross-domain collaborative optimization algorithm is employed to guide resource allocation in the edge-cloud system. By ensuring that each task is matched with suitable servers, the edge-cloud system can achieve more efficient inference. The evaluation results on public datasets demonstrate that CDIO can effectively meet the accuracy and delay requirements for task processing. Compared to state-of-the-art edge-cloud solutions, CDIO reduces computing and bandwidth consumption by 20%-40% and energy consumption by more than 40%.
Submitted 6 February, 2025; originally announced February 2025.
Comments: 10 pages, 9 figures
arXiv:2502.03825 (https://arxiv.org/abs/2502.03825)
Subjects: eess.IV (Image and Video Processing); cs.CR (Cryptography and Security); cs.CV (Computer Vision and Pattern Recognition)
Title: Synthetic Poisoning Attacks: The Impact of Poisoned MRI Image on U-Net Brain Tumor Segmentation
Authors: Tianhao Li, Tianyu Zeng, Yujia Zheng, Chulong Zhang, Jingyu Lu, Haotian Huang, Chuangxin Chu, Fang-Fang Yin, Zhenyu Yang
Abstract: Deep learning-based medical image segmentation models, such as U-Net, rely on high-quality annotated datasets to achieve accurate predictions. However, the increasing use of generative models for synthetic data augmentation introduces potential risks, particularly in the absence of rigorous quality control.
In this paper, we investigate the impact of synthetic MRI data on the robustness and segmentation accuracy of U-Net models for brain tumor segmentation. Specifically, we generate synthetic T1-contrast-enhanced (T1-Ce) MRI scans using a GAN-based model with a shared encoding-decoding framework and shortest-path regularization. To quantify the effect of synthetic data contamination, we train U-Net models on progressively "poisoned" datasets, where synthetic data proportions range from 16.67% to 83.33%. Experimental results on a real MRI validation set reveal significant performance degradation as the synthetic proportion increases, with Dice coefficients dropping from 0.8937 (33.33% synthetic) to 0.7474 (83.33% synthetic). Accuracy and sensitivity exhibit similar downward trends, demonstrating the detrimental effect of synthetic data on segmentation robustness. These findings underscore the importance of quality control in synthetic data integration and highlight the risks of unregulated synthetic augmentation in medical image analysis. Our study provides critical insights for the development of more reliable and trustworthy AI-driven medical imaging systems.
Submitted 6 February, 2025; originally announced February 2025.
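The Dice coefficient quoted in this abstract is the standard overlap metric for binary segmentation masks; a small NumPy sketch for reference (eps guards the empty-mask case):

import numpy as np

def dice_coefficient(pred_mask, true_mask, eps=1e-7):
    # pred_mask, true_mask: binary arrays of the same shape
    pred = pred_mask.astype(bool)
    true = true_mask.astype(bool)
    intersection = np.logical_and(pred, true).sum()
    # 2|A ∩ B| / (|A| + |B|): 1.0 = perfect overlap, 0.0 = disjoint
    return (2.0 * intersection + eps) / (pred.sum() + true.sum() + eps)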
arXiv:2502.02443 (https://arxiv.org/abs/2502.02443)
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Title: A Null Space Compliance Approach for Maintaining Safety and Tracking Performance in Human-Robot Interactions
Authors: Zi-Qi Yang, Miaomiao Wang, Mehrdad R. Kermani
Abstract: In recent years, the focus on developing robot manipulators has shifted towards prioritizing safety in Human-Robot Interaction (HRI). Impedance control is a typical approach for interaction control in collaboration tasks. However, such a control approach has two main limitations: 1) the end-effector's (EE's) limited compliance to adapt to unknown physical interactions, and 2) the inability of the robot body to compliantly adapt to unknown physical interactions. In this work, we present an approach to address these drawbacks. We introduce a modified Cartesian impedance control method combined with a Dynamical System (DS)-based motion generator, aimed at enhancing the interaction capability of the EE without compromising main-task tracking performance. This approach enables human coworkers to interact with the EE on the fly, e.g., for tool changeover, after which the robot compliantly resumes its task. Additionally, combining it with a new null-space impedance control method enables the robot body to exhibit compliant behaviour in response to interactions, avoiding serious injuries from accidental contact while mitigating the impact on main-task tracking performance. Finally, we prove the passivity of the system and validate the proposed approach through comprehensive comparative experiments on a 7-Degree-of-Freedom (DOF) KUKA LWR IV+ robot.
Submitted 4 February, 2025; originally announced February 2025.
Comments: 8 pages, 11 figures
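The null-space idea in the title builds on a standard torque-level projection: secondary (compliance/posture) torques are filtered through N = I - Jᵀ(Jᵀ)⁺ so they cannot disturb the end-effector task. A minimal NumPy sketch of that textbook projector, not the authors' controller (which adds impedance shaping and passivity guarantees):

import numpy as np

def nullspace_torque(J, tau_task, tau_posture):
    # J: (6, n) task Jacobian; tau_task, tau_posture: (n,) joint torques
    JT_pinv = np.linalg.pinv(J.T)            # (6, n) pseudoinverse of J^T
    N = np.eye(J.shape[1]) - J.T @ JT_pinv   # (n, n) null-space projector
    # secondary torques act only in directions invisible to the EE task
    return tau_task + N @ tau_posture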
arXiv:2502.01241 (https://arxiv.org/abs/2502.01241)
Subjects: cs.CR (Cryptography and Security)
Title: Peering Behind the Shield: Guardrail Identification in Large Language Models
Authors: Ziqing Yang, Yixin Wu, Rui Wen, Michael Backes, Yang Zhang
Abstract: Human-AI conversations have gained increasing attention since the era of large language models. Consequently, more techniques, such as input/output guardrails and safety alignment, have been proposed to prevent potential misuse of such Human-AI conversations. However, the ability to identify these guardrails has significant implications, both for adversarial exploitation and for auditing purposes by red-team operators. In this work, we propose a novel method, AP-Test, which identifies the presence of a candidate guardrail by leveraging guardrail-specific adversarial prompts to query the AI agent. Extensive experiments on four candidate guardrails under diverse scenarios showcase the effectiveness of our method. The ablation study further illustrates the importance of the components we designed, such as the loss terms.
Submitted 3 February, 2025; originally announced February 2025.
arXiv:2502.00859 (https://arxiv.org/abs/2502.00859)
Subjects: cs.LG (Machine Learning); cs.DC (Distributed, Parallel, and Cluster Computing)
Title: FedRIR: Rethinking Information Representation in Federated Learning
Authors: Yongqiang Huang, Zerui Shao, Ziyuan Yang, Zexin Lu, Yi Zhang
Abstract: Mobile and Web-of-Things (WoT) devices at the network edge generate vast amounts of data for machine learning applications, yet privacy concerns hinder centralized model training. Federated Learning (FL) allows clients (devices) to collaboratively train a shared model coordinated by a central server without transferring private data, but inherent statistical heterogeneity among clients presents challenges, often leading to a dilemma between clients' needs for personalized local models and the server's goal of building a generalized global model. Existing FL methods typically prioritize either global generalization or local personalization, resulting in a trade-off between these two objectives and limiting the full potential of diverse client data. To address this challenge, we propose a novel framework that simultaneously enhances global generalization and local personalization by Rethinking Information Representation in the Federated learning process (FedRIR). Specifically, we introduce Masked Client-Specific Learning (MCSL), which isolates and extracts fine-grained client-specific features tailored to each client's unique data characteristics, thereby enhancing personalization. Concurrently, the Information Distillation Module (IDM) refines the global shared features by filtering out redundant client-specific information, resulting in a purer and more robust global representation that enhances generalization.
By integrating the refined global features with the isolated client-specific features, we construct enriched representations that effectively capture both global patterns and local nuances, thereby improving the performance of downstream tasks on the client. The code is available at https://github.com/Deep-Imaging-Group/FedRIR.
Submitted 2 February, 2025; originally announced February 2025.
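The integration step described above can be sketched as two encoders, one shared across clients and one kept local, whose features are concatenated for the downstream head. Module names and the masking/distillation details are omitted simplifications, not the paper's architecture.

import torch
import torch.nn as nn

class ClientRepresentation(nn.Module):
    def __init__(self, in_dim, feat_dim, num_classes):
        super().__init__()
        self.global_enc = nn.Linear(in_dim, feat_dim)  # shared, aggregated by server
        self.client_enc = nn.Linear(in_dim, feat_dim)  # client-specific, stays local
        self.head = nn.Linear(2 * feat_dim, num_classes)

    def forward(self, x):
        g = self.global_enc(x)   # global patterns
        c = self.client_enc(x)   # local nuances
        return self.head(torch.cat([g, c], dim=-1))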
arXiv:2502.00808 (https://arxiv.org/abs/2502.00808)
Subjects: cs.LG (Machine Learning); cs.CR (Cryptography and Security); cs.CY (Computers and Society)
Title: Synthetic Artifact Auditing: Tracing LLM-Generated Synthetic Data Usage in Downstream Applications
Authors: Yixin Wu, Ziqing Yang, Yun Shen, Michael Backes, Yang Zhang
Abstract: Large language models (LLMs) have facilitated the generation of high-quality, cost-effective synthetic data for developing downstream models and conducting statistical analyses in various domains. However, the increased reliance on synthetic data may pose potential negative impacts. Numerous studies have demonstrated that LLM-generated synthetic data can perpetuate and even amplify societal biases and stereotypes, and produce erroneous outputs known as "hallucinations" that deviate from factual knowledge. In this paper, we aim to audit artifacts, such as classifiers, generators, or statistical plots, to identify those trained on or derived from synthetic data and raise user awareness, thereby reducing unexpected consequences and risks in downstream applications. To this end, we take the first step by introducing synthetic artifact auditing, which assesses whether a given artifact is derived from LLM-generated synthetic data. We then propose an auditing framework with three methods: metric-based, tuning-based, and classification-based auditing. These methods operate without requiring the artifact owner to disclose proprietary training details. We evaluate our auditing framework on three text classification tasks, two text summarization tasks, and two data visualization tasks across three training scenarios. Our evaluation demonstrates the effectiveness of all proposed auditing methods across all these tasks. For instance, black-box metric-based auditing can achieve an average accuracy of 0.868 ± 0.071 for auditing classifiers and 0.880 ± 0.052 for auditing generators using only 200 random queries across three scenarios. We hope our research will enhance model transparency and regulatory compliance, ensuring the ethical and responsible use of synthetic data.
Submitted 2 February, 2025; originally announced February 2025.
Comments: To appear in the 34th USENIX Security Symposium, August 13-15, 2025
arXiv:2501.19393 (https://arxiv.org/abs/2501.19393)
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: s1: Simple test-time scaling
Authors: Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, Tatsunori Hashimoto
Abstract: Test-time scaling is a promising new approach to language modeling that uses extra test-time compute to improve performance. Recently, OpenAI's o1 model showed this capability but did not publicly share its methodology, leading to many replication efforts. We seek the simplest approach to achieve test-time scaling and strong reasoning performance. First, we curate a small dataset s1K of 1,000 questions paired with reasoning traces, relying on three criteria we validate through ablations: difficulty, diversity, and quality. Second, we develop budget forcing to control test-time compute by forcefully terminating the model's thinking process or lengthening it by appending "Wait" multiple times to the model's generation when it tries to end. This can lead the model to double-check its answer, often fixing incorrect reasoning steps. After supervised finetuning the Qwen2.5-32B-Instruct language model on s1K and equipping it with budget forcing, our model s1-32B exceeds o1-preview on competition math questions by up to 27% (MATH and AIME24). Further, scaling s1-32B with budget forcing allows extrapolating beyond its performance without test-time intervention: from 50% to 57% on AIME24. Our model, data, and code are open-source at https://github.com/simplescaling/s1
Submitted 3 February, 2025; v1 submitted 31 January, 2025; originally announced January 2025.
Comments: 45 pages (9 main), 10 figures, 14 tables
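Budget forcing, as described in this abstract, is a decoding-loop trick; below is a hedged sketch against a hypothetical completion API (model.generate, the .text/.num_tokens fields, and the "</think>" marker are all illustrative assumptions, not the paper's interface):

def generate_with_budget_forcing(model, prompt, min_thinking_tokens=512):
    # Hypothetical API: model.generate(text, stop=...) returns an object
    # with .text (newly generated tokens) and .num_tokens.
    text, spent = prompt, 0
    while spent < min_thinking_tokens:
        out = model.generate(text, stop="</think>")  # model tries to stop thinking
        spent += out.num_tokens
        text = text + out.text + "Wait"              # suppress the stop; keep thinking
    return model.generate(text + "</think>")         # budget spent: emit the answer

Appending "Wait" instead of the end-of-thinking marker is what nudges the model to re-examine and often correct its own reasoning.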
arXiv:2501.18895 (https://arxiv.org/abs/2501.18895)
Subjects: cs.CL (Computation and Language)
Title: Efficient Supernet Training with Orthogonal Softmax for Scalable ASR Model Compression
Authors: Jingjing Xu, Eugen Beck, Zijian Yang, Ralf Schlüter
Abstract: ASR systems are deployed across diverse environments, each with specific hardware constraints. We use supernet training to jointly train multiple encoders of varying sizes, enabling dynamic model size adjustment to fit hardware constraints without redundant training. Moreover, we introduce a novel method called OrthoSoftmax, which applies multiple orthogonal softmax functions to efficiently identify optimal subnets within the supernet, avoiding resource-intensive search. This approach also enables more flexible and precise subnet selection by allowing selection based on various criteria and levels of granularity. Our results with CTC on Librispeech and TED-LIUM-v2 show that FLOPs-aware component-wise selection achieves the best overall performance. With the same number of training updates from one single job, WERs for all model sizes are comparable to or slightly better than those of individually trained models. Furthermore, we analyze patterns in the selected components and reveal interesting insights.
Submitted 4 February, 2025; v1 submitted 31 January, 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025
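The abstract does not spell out the construction, but one plausible reading of "multiple orthogonal softmax functions" is a set of softmax selectors over the same candidate components, regularized toward mutual orthogonality so that different selectors commit to different components. A speculative sketch with illustrative shapes, not the paper's formulation:

import torch

def orthogonal_softmax_penalty(logits):
    # logits: (K selectors, C candidate components)
    S = torch.softmax(logits, dim=-1)               # selection distributions
    gram = S @ S.T                                  # pairwise selector overlaps
    off_diag = gram - torch.diag(torch.diag(gram))  # keep only cross terms
    return off_diag.pow(2).sum()                    # 0 when selectors are orthogonal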
arXiv:2501.18475 (https://arxiv.org/abs/2501.18475)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: CLoQ: Enhancing Fine-Tuning of Quantized LLMs via Calibrated LoRA Initialization
Authors: Yanxia Deng, Aozhong Zhang, Naigang Wang, Selcuk Gurses, Zi Yang, Penghang Yin
Abstract: Fine-tuning large language models (LLMs) using low-rank adaptation (LoRA) has become a highly efficient approach for downstream tasks, particularly in scenarios with limited computational resources. However, applying LoRA techniques to quantized LLMs poses unique challenges due to the reduced representational precision of quantized weights. In this paper, we introduce CLoQ (Calibrated LoRA initialization for Quantized LLMs), a simple initialization strategy designed to overcome these challenges. Our approach focuses on minimizing the layer-wise discrepancy between the original LLM and its quantized counterpart with LoRA components during initialization.
By leveraging a small calibration dataset, CLoQ quantizes a pre-trained LLM and determines the optimal LoRA components for each layer, ensuring a strong foundation for subsequent fine-tuning. A key contribution of this work is a novel theoretical result that enables the accurate, closed-form construction of these optimal LoRA components. We validate the efficacy of CLoQ across multiple tasks, including language generation, arithmetic reasoning, and commonsense reasoning, demonstrating that it consistently outperforms existing LoRA fine-tuning methods for quantized LLMs, especially at ultra-low bit widths.
Submitted 30 January, 2025; originally announced January 2025.
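CLoQ's closed-form, calibration-weighted construction is the paper's contribution; the related and well-known SVD initialization of the quantization residual illustrates the shape of such a solution. A hedged PyTorch sketch (the calibration weighting is omitted, and lora_init_from_residual is an illustrative name):

import torch

def lora_init_from_residual(W, W_q, rank=16):
    # Choose low-rank factors B @ A to best approximate what quantization
    # lost, so that W ≈ W_q + B @ A before any fine-tuning.
    residual = W - W_q                                # (out, in)
    U, S, Vh = torch.linalg.svd(residual, full_matrices=False)
    B = U[:, :rank] * S[:rank]                        # (out, r), absorbs singular values
    A = Vh[:rank, :]                                  # (r, in)
    return A, B

By the Eckart-Young theorem, this truncated SVD gives the best rank-r approximation of the residual in the Frobenius norm, which is why residual-fitting initializations of this kind work well at low bit widths.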
arXiv:2501.17615 (https://arxiv.org/abs/2501.17615) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Cross-lingual Embedding Clustering for Hierarchical Softmax in Low-Resource Multilingual Speech Recognition
Authors: Zhengdong Yang, Qianying Liu, Sheng Li, Fei Cheng, Chenhui Chu
Abstract: We present a novel approach centered on the decoding stage of Automatic Speech Recognition (ASR) that enhances multilingual performance, especially for low-resource languages. It utilizes a cross-lingual embedding clustering method to construct a hierarchical Softmax (H-Softmax) decoder, which enables similar tokens across different languages to share similar decoder representations. It addresses the limitations of the previous Huffman-based H-Softmax method, which relied on shallow features for token similarity assessment. Through experiments on a downsampled dataset of 15 languages, we demonstrate the effectiveness of our approach in improving low-resource multilingual ASR accuracy.
Submitted 29 January, 2025; originally announced January 2025.

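As a rough illustration of the decoder construction described above, one can cluster token embeddings and factorize the output distribution over (cluster, token-within-cluster). A hedged sketch, with synthetic embeddings standing in for the paper's cross-lingual ones:

    import numpy as np
    from sklearn.cluster import KMeans

    # Synthetic token embeddings stand in for the cross-lingual embedding
    # space, in which similar tokens from different languages should fall
    # into the same cluster.
    rng = np.random.default_rng(0)
    emb = rng.standard_normal((1000, 32))

    # Level 1 of the H-Softmax tree: assign every output token to a cluster.
    k = 32
    cluster_of = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(emb)

    # Decoding then factorizes P(token|h) = P(cluster|h) * P(token|cluster, h),
    # replacing one |V|-way softmax with a k-way plus a ~|V|/k-way softmax.
    members = [np.flatnonzero(cluster_of == c) for c in range(k)]
    print("mean cluster size:", np.mean([len(m) for m in members]))
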
arXiv:2501.17116 (https://arxiv.org/abs/2501.17116) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language)
Title: Optimizing Large Language Model Training Using FP4 Quantization
Authors: Ruizhe Wang, Yeyun Gong, Xiao Liu, Guoshuai Zhao, Ziyue Yang, Baining Guo, Zhengjun Zha, Peng Cheng
Abstract: The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low-precision training.
Submitted 28 January, 2025; originally announced January 2025.

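The two ingredients named in the abstract can be pictured with a toy PyTorch snippet: a rounding step whose gradient is passed straight through (the paper's differentiable estimator is more refined than this common stand-in), plus quantile-based clamping of outliers before quantization. The FP4-like level grid below is illustrative, not the exact E2M1 format:

    import torch

    def quantize_ste(w, levels):
        # Round each weight to the nearest grid level; the straight-through
        # trick keeps the forward value quantized while the backward pass
        # treats the rounding as identity.
        w_q = levels[(w.unsqueeze(-1) - levels).abs().argmin(dim=-1)]
        return w + (w_q - w).detach()

    def clamp_outliers(x, q=0.999):
        # Cap rare extreme values so the quantization grid is not wasted on
        # a handful of outliers (the paper pairs this with compensation).
        hi = float(torch.quantile(x.abs(), q))
        return x.clamp(-hi, hi)

    levels = torch.tensor([-6.0, -4.0, -3.0, -2.0, -1.5, -1.0, -0.5, 0.0,
                           0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])  # toy FP4-like grid
    w = torch.randn(4, 4, requires_grad=True)
    quantize_ste(clamp_outliers(w), levels).sum().backward()
    print(w.grad)  # gradient reaches w despite the non-differentiable rounding
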
arXiv:2501.15977 (https://arxiv.org/abs/2501.15977) [pdf, other]
Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
Title: Classification Error Bound for Low Bayes Error Conditions in Machine Learning
Authors: Zijian Yang, Vahe Eminyan, Ralf Schlüter, Hermann Ney
Abstract: In statistical classification and machine learning, classification error is an important performance measure, which is minimized by the Bayes decision rule. In practice, the unknown true distribution is usually replaced with a model distribution estimated from the training data in the Bayes decision rule. This substitution introduces a mismatch between the Bayes error and the model-based classification error. In this work, we apply classification error bounds to study the relationship between this error mismatch and the Kullback-Leibler divergence in machine learning. Motivated by recent observations of low model-based classification errors in many machine learning tasks, which bound the Bayes error from above, we propose a linear approximation of the classification error bound for low Bayes error conditions. We then discuss the bound for class priors and extend the classification error bound to sequences. Using automatic speech recognition as a representative example of machine learning applications, this work analytically discusses the correlations among different performance measures under the extended bounds, including cross-entropy loss, language model perplexity, and word error rate.
Submitted 27 January, 2025; originally announced January 2025.
Comments: accepted at ICASSP 2025

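The KL-divergence connection can be made concrete on a toy discrete task. The sketch below checks a standard Pinsker-style chain, mismatch <= sqrt(2 * E[KL]) (via total variation and Jensen's inequality); this is illustrative of the error-mismatch-versus-KL relationship, not necessarily the exact bound derived in the paper:

    import numpy as np

    # Toy discrete task: true posterior p(c|x) vs. model posterior q(c|x).
    p_x  = np.array([0.5, 0.3, 0.2])                        # P(x)
    p_cx = np.array([[0.9, 0.1], [0.6, 0.4], [0.2, 0.8]])   # true P(c|x)
    q_cx = np.array([[0.7, 0.3], [0.4, 0.6], [0.3, 0.7]])   # model Q(c|x)

    bayes_err = np.sum(p_x * (1.0 - p_cx.max(axis=1)))      # Bayes decision rule
    model_err = np.sum(p_x * (1.0 - p_cx[np.arange(3), q_cx.argmax(axis=1)]))
    kl = np.sum(p_x * np.sum(p_cx * np.log(p_cx / q_cx), axis=1))

    # Pinsker + Jensen: (model_err - bayes_err) <= sqrt(2 * E[KL]).
    print(f"mismatch {model_err - bayes_err:.3f} <= bound {np.sqrt(2 * kl):.3f}")
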
arXiv:2501.15775 (https://arxiv.org/abs/2501.15775) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.SE (Software Engineering)
Title: Do Existing Testing Tools Really Uncover Gender Bias in Text-to-Image Models?
Authors: Yunbo Lyu, Zhou Yang, Yuqing Niu, Jing Jiang, David Lo
Abstract: Text-to-Image (T2I) models have recently gained significant attention due to their ability to generate high-quality images and are consequently used in a wide range of applications. However, there are concerns about the gender bias of these models. Previous studies have shown that T2I models can perpetuate or even amplify gender stereotypes when provided with neutral text prompts. Researchers have proposed automated detectors for uncovering gender bias in T2I models, but a crucial gap exists: no existing work comprehensively compares the various detectors or examines how the gender bias they detect deviates from the actual situation. This study addresses this gap by validating previous gender bias detectors using a manually labeled dataset and comparing how the bias identified by various detectors deviates from the actual bias in T2I models, as verified by manual confirmation. We create a dataset consisting of 6,000 images generated from three cutting-edge T2I models: Stable Diffusion XL, Stable Diffusion 3, and Dreamlike Photoreal 2.0. During the human-labeling process, we find that all three T2I models generate a portion (12.48% on average) of low-quality images (e.g., images with no face present), for which human annotators cannot determine the gender of the person. Our analysis reveals that all three T2I models show a preference for generating male images, with SDXL being the most biased. Additionally, images generated using prompts containing professional descriptions (e.g., lawyer or doctor) show the most bias. We evaluate seven gender bias detectors and find that none fully capture the actual level of bias in T2I models, with some detectors overestimating bias by up to 26.95%. We further investigate the causes of inaccurate estimations, highlighting the limitations of detectors in dealing with low-quality images. Based on our findings, we propose an enhanced detector...
Submitted 26 January, 2025; originally announced January 2025.

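The deviation the study measures boils down to comparing a detector's gender tally against human labels on the same images, after discarding the unusable ones. A trivial sketch with hypothetical labels:

    from collections import Counter

    # Hypothetical per-image gender labels from one detector vs. human
    # annotators; 'unk' marks the low-quality (e.g., faceless) images the
    # study reports, which are excluded from the tally.
    detector = ["m", "m", "f", "m", "unk", "m", "m", "m"]
    human    = ["m", "f", "f", "m", "unk", "m", "m", "m"]

    def male_ratio(labels):
        c = Counter(l for l in labels if l != "unk")
        return c["m"] / (c["m"] + c["f"])

    dev = male_ratio(detector) - male_ratio(human)
    print(f"detector {male_ratio(detector):.2f} vs. manual {male_ratio(human):.2f} "
          f"(over-estimation {dev:+.2f})")
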
arXiv:2501.15588 (https://arxiv.org/abs/2501.15588) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Tumor Detection, Segmentation and Classification Challenge on Automated 3D Breast Ultrasound: The TDSC-ABUS Challenge
Authors: Gongning Luo, Mingwang Xu, Hongyu Chen, Xinjie Liang, Xing Tao, Dong Ni, Hyunsu Jeong, Chulhong Kim, Raphael Stock, Michael Baumgartner, Yannick Kirchhoff, Maximilian Rokuss, Klaus Maier-Hein, Zhikai Yang, Tianyu Fan, Nicolas Boutry, Dmitry Tereshchenko, Arthur Moine, Maximilien Charmetant, Jan Sauer, Hao Du, Xiang-Hui Bai, Vipul Pai Raikar, Ricardo Montoya-del-Angel, Robert Marti, et al. (12 additional authors not shown)
Abstract: Breast cancer is one of the most common causes of death among women worldwide. Early detection helps in reducing the number of deaths. Automated 3D Breast Ultrasound (ABUS) is a newer approach for breast screening that has many advantages over handheld mammography, such as safety, speed, and a higher detection rate of breast cancer. Tumor detection, segmentation, and classification are key components in the analysis of medical images, and they are especially challenging in the context of 3D ABUS due to the significant variability in tumor size and shape, unclear tumor boundaries, and a low signal-to-noise ratio. The lack of publicly accessible, well-labeled ABUS datasets further hinders the advancement of systems for breast tumor analysis. Addressing this gap, we have organized the inaugural Tumor Detection, Segmentation, and Classification Challenge on Automated 3D Breast Ultrasound 2023 (TDSC-ABUS2023). This initiative aims to spearhead research in this field and create a definitive benchmark for tasks associated with 3D ABUS image analysis. In this paper, we summarize the top-performing algorithms from the challenge and provide critical analysis for ABUS image examination. We offer the TDSC-ABUS challenge as an open-access platform at https://tdsc-abus2023.grand-challenge.org/ to benchmark and inspire future developments in algorithmic research.
Submitted 26 January, 2025; originally announced January 2025.

arXiv:2501.15470 (https://arxiv.org/abs/2501.15470) [pdf, other]
Subjects: cs.IR (Information Retrieval); cs.MA (Multiagent Systems)
Title: Unveiling the Potential of Multimodal Retrieval Augmented Generation with Planning
Authors: Xiaohan Yu, Zhihan Yang, Chong Chen
Abstract: Multimodal Retrieval Augmented Generation (MRAG) systems, while promising for enhancing Multimodal Large Language Models (MLLMs), often rely on rigid, single-step retrieval methods. This limitation hinders their ability to effectively address real-world scenarios that demand adaptive information acquisition and query refinement. To overcome this, we introduce the novel task of Multimodal Retrieval Augmented Generation Planning (MRAG Planning), focusing on optimizing MLLM performance while minimizing computational overhead. We present CogPlanner, a versatile framework inspired by human cognitive processes. CogPlanner iteratively refines queries and selects retrieval strategies, enabling both parallel and sequential modeling approaches. To rigorously evaluate MRAG Planning, we introduce CogBench, a new benchmark specifically designed for this task. CogBench facilitates the integration of lightweight CogPlanner with resource-efficient MLLMs. Our experimental findings demonstrate that CogPlanner surpasses existing MRAG baselines, achieving significant improvements in both accuracy and efficiency with minimal computational overhead.
Submitted 26 January, 2025; originally announced January 2025.

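The iterative refine-and-retrieve behavior attributed to CogPlanner can be sketched as a small control loop. Everything below (the planner interface, the strategy names) is a hypothetical stand-in, not the paper's API:

    def mrag_plan(question, planner, retrievers, max_steps=3):
        # Iteratively refine the query and pick a retrieval strategy until
        # the planner judges the gathered context sufficient.
        query, context = question, []
        for _ in range(max_steps):
            action = planner.decide(question, query, context)
            if action["stop"]:
                break
            query = action["refined_query"]                   # query refinement
            context += retrievers[action["strategy"]](query)  # strategy selection
        return context

    class OneShotPlanner:
        # Trivial stand-in policy: refine and retrieve once, then stop.
        def decide(self, question, query, context):
            if context:
                return {"stop": True}
            return {"stop": False, "strategy": "text",
                    "refined_query": query + " (which building, which architect?)"}

    print(mrag_plan("Who designed this building?", OneShotPlanner(),
                    {"text": lambda q: [f"retrieved passage for: {q}"]}))
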
arXiv:2501.15429 (https://arxiv.org/abs/2501.15429) [pdf, other]
Subjects: cs.IR (Information Retrieval)
DOI: 10.1145/3701551.3703528 (https://doi.org/10.1145/3701551.3703528)
Title: An Aspect Performance-aware Hypergraph Neural Network for Review-based Recommendation
Authors: Junrui Liu, Tong Li, Di Wu, Zifang Tang, Yuan Fang, Zhen Yang
Abstract: Online reviews allow consumers to provide detailed feedback on various aspects of items. Existing methods utilize these aspects to model users' fine-grained preferences for specific item features through graph neural networks. We argue that the performance of items on different aspects is important for making precise recommendations, which has not been taken into account by existing approaches due to a lack of data. In this paper, we propose an aspect performance-aware hypergraph neural network (APH) for review-based recommendation, which learns the performance of items from the conflicting sentiment polarity of user reviews. Specifically, APH comprehensively models the relationships among users, items, aspects, and sentiment polarity by systematically constructing an aspect hypergraph based on user reviews. In addition, APH aggregates aspects representing users and items by employing an aspect performance-aware hypergraph aggregation method.
It aggregates the sentiment polarities from multiple users by jointly considering user preferences and the semantics of their sentiments, determining the weights of sentiment polarities to infer the performance of items on various aspects. These performances are then used as weights to aggregate neighboring aspects. Experiments on six real-world datasets demonstrate that APH improves MSE, Precision@5, and Recall@5 by an average of 2.30%, 4.89%, and 1.60% over the best baseline. The source code and data are available at https://github.com/dianziliu/APH.
Submitted 26 January, 2025; originally announced January 2025.
Comments: 12 pages, accepted by WSDM'25

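The weighting step described above, inferring an item's per-aspect performance from conflicting sentiments, can be pictured as a weighted average. A toy sketch in which fixed weights stand in for the user-preference and semantic weighting APH learns:

    import numpy as np

    # Reviews of one item's "battery" aspect: (weight, polarity) pairs, the
    # weight standing in for APH's learned user-preference/semantic weighting.
    reviews = [(0.9, +1.0), (0.4, -1.0), (0.7, +0.5), (0.2, -0.5)]
    w = np.array([r[0] for r in reviews])
    s = np.array([r[1] for r in reviews])

    w = np.exp(w) / np.exp(w).sum()     # normalize the weights (softmax)
    performance = float(w @ s)          # inferred aspect-level performance
    print(f"battery performance: {performance:+.3f}")
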
arXiv:2501.14600 (https://arxiv.org/abs/2501.14600) [pdf, other]
Subjects: cs.SI (Social and Information Networks)
Title: On the Homophily of Heterogeneous Graphs: Understanding and Unleashing
Authors: Zhen Tao, Ziyue Qiao, Chaoqi Chen, Zhengyi Yang, Lun Du, Qingqiang Sun
Abstract: Homophily, the tendency of similar nodes to connect, is a fundamental phenomenon in network science and a critical factor in the performance of graph neural networks (GNNs). While existing studies primarily explore homophily in homogeneous graphs, where nodes share the same type, real-world networks are often more accurately modeled as heterogeneous graphs (HGs) with diverse node types and intricate cross-type interactions. This structural diversity complicates the analysis of homophily, as traditional homophily metrics fail to account for distinct label spaces across node types. To address this limitation, we introduce the Cross-Type Homophily Ratio, a novel metric that quantifies homophily based on the similarity of target information across different node types. Furthermore, we introduce Cross-Type Homophily-guided Heterogeneous Graph Pruning, a method designed to selectively remove low-homophily cross-type edges, thereby enhancing the Cross-Type Homophily Ratio and boosting the performance of heterogeneous graph neural networks (HGNNs). Extensive experiments on five real-world HG datasets validate the effectiveness of our approach, which delivers up to 13.36% average relative performance improvement for HGNNs, offering a fresh perspective on cross-type homophily in heterogeneous graph learning.
Submitted 24 January, 2025; originally announced January 2025.

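Since node types have distinct label spaces, a cross-type homophily score has to compare some shared "target information" rather than raw labels. One plausible toy reading, with pruning of the least similar edges (the metric's precise definition is in the paper):

    import numpy as np

    # Each node carries a distribution over the task's target classes,
    # propagated to both types, so endpoints of a cross-type edge become
    # comparable even though their own label spaces differ.
    rng = np.random.default_rng(0)
    target_a = rng.dirichlet(np.ones(3), size=5)   # type-A nodes
    target_b = rng.dirichlet(np.ones(3), size=4)   # type-B nodes
    edges = [(0, 1), (1, 1), (2, 3), (4, 0)]       # A-B cross-type edges

    def cosine(u, v):
        return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

    sims = np.array([cosine(target_a[i], target_b[j]) for i, j in edges])
    print("cross-type homophily ratio:", sims.mean())
    # Pruning idea from the paper: drop the lowest-similarity cross-type
    # edges to raise the ratio before training the HGNN.
    print("keep mask:", sims >= np.quantile(sims, 0.25))
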
arXiv:2501.13742 (https://arxiv.org/abs/2501.13742) [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: An Empirical Study of Retrieval-Augmented Code Generation: Challenges and Opportunities
Authors: Zezhou Yang, Sirong Chen, Cuiyun Gao, Zhenhao Li, Xing Hu, Kui Liu, Xin Xia
Abstract: Code generation aims to automatically generate code snippets in a specific programming language from natural language descriptions. The continuous advancements in deep learning, particularly pre-trained models, have empowered the code generation task to achieve remarkable performance. One main challenge for pre-trained code generation models is the semantic gap between natural language requirements and source code. To address the issue, prior studies typically adopt a retrieval-augmented framework for the task, where similar code snippets collected by a retrieval process can be leveraged to help understand the requirements and provide guidance for the generation process. However, there is a lack of systematic study on the application of this framework to code generation, including its impact on the final generated results and the specific usage of the framework. In this paper, we choose three popular pre-trained code models, namely CodeGen, UniXcoder, and CodeT5, to assess the impact of the quality and utilization of retrieved code on the retrieval-augmented framework. Our analysis shows that the retrieval-augmented framework is beneficial for improving the performance of the existing pre-trained models. We also provide suggestions on the utilization of the retrieval-augmented code generation framework: BM25 and Sequential Integration Fusion are recommended due to their convenience and superior performance. Sketch Filling Fusion, which extracts a sketch of relevant code, can help the model improve its performance further. Additionally, we conduct experiments to investigate the influence of the retrieval-augmented framework on large language models for code generation, showing the effectiveness of the framework, and we discuss the trade-off between performance improvement and computational costs in each phase of the framework.
Submitted 23 January, 2025; originally announced January 2025.
Comments: This paper is accepted by TOSEM

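The recommended BM25 + sequential-integration setup is straightforward to prototype: score stored requirement descriptions against the new requirement and prepend the paired snippet to the generation prompt. A sketch using the rank_bm25 package (assuming it is installed; the corpus and query are toy data):

    from rank_bm25 import BM25Okapi  # pip install rank-bm25

    # Toy store: natural-language requirements paired with code snippets.
    docs = ["read a json file", "write rows to a csv file",
            "download a file from a url"]
    code = [
        "def read_json(path):\n    import json\n    with open(path) as f:\n        return json.load(f)",
        "def write_csv(rows, path): ...",
        "def download(url, dest): ...",
    ]
    bm25 = BM25Okapi([d.split() for d in docs])

    # Retrieve the snippet whose description best matches the new
    # requirement and prepend it to the prompt (sequential integration).
    query = "load json data from disk"
    best = bm25.get_top_n(query.split(), code, n=1)[0]
    prompt = f"# similar code:\n{best}\n\n# task: {query}\n"
    print(prompt)
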
arXiv:2501.13629 (https://arxiv.org/abs/2501.13629) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Sigma: Differential Rescaling of Query, Key and Value for Efficient Language Models
Authors: Zhenghao Lin, Zihao Tang, Xiao Liu, Yeyun Gong, Yi Cheng, Qi Chen, Hang Li, Ying Xin, Ziyue Yang, Kailai Yang, Yu Yan, Xiao Liang, Shuai Lu, Yiming Huang, Zheheng Luo, Lei Qu, Xuan Feng, Yaoxiang Wang, Yuqing Xia, Feiyang Chen, Yuting Jiang, Yasen Hu, Hao Ni, Binyang Li, Guoshuai Zhao, et al. (9 additional authors not shown)
Abstract: We introduce Sigma, an efficient large language model specialized for the system domain, empowered by a novel architecture including DiffQKV attention, and pre-trained on our meticulously collected system domain data.
DiffQKV attention significantly enhances the inference efficiency of Sigma by optimizing the Query (Q), Key (K), and Value (V) components in the attention mechanism differentially, based on their varying impacts on model performance and efficiency indicators. Specifically, we (1) conduct extensive experiments that demonstrate the model's varying sensitivity to the compression of the K and V components, leading to the development of differentially compressed KV, and (2) propose augmented Q to expand the Q head dimension, which enhances the model's representation capacity with minimal impact on inference speed. Rigorous theoretical and empirical analyses reveal that DiffQKV attention significantly enhances efficiency, achieving up to a 33.36% improvement in inference speed over conventional grouped-query attention (GQA) in long-context scenarios. We pre-train Sigma on 6T tokens from various sources, including 19.5B tokens of system domain data that we carefully collect and 1T tokens of synthesized and rewritten data. In general domains, Sigma achieves performance comparable to other state-of-the-art models. In the system domain, we introduce the first comprehensive benchmark, AIMicius, where Sigma demonstrates remarkable performance across all tasks, significantly outperforming GPT-4 with an absolute improvement of up to 52.5%.
Submitted 10 February, 2025; v1 submitted 23 January, 2025; originally announced January 2025.

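The "differentially compressed KV" idea can be pictured as grouped-query attention in which K and V get different numbers of shared heads (the abstract reports K tolerates more compression than V). The toy shapes below only illustrate that asymmetry; augmented Q and the rest of the design are omitted:

    import torch
    import torch.nn.functional as F

    B, T, D = 2, 16, 64            # batch, sequence length, model dim
    Hq, Hk, Hv = 8, 2, 4           # many Q heads; K compressed harder than V
    d = D // Hq

    q = torch.randn(B, Hq, T, d)
    k = torch.randn(B, Hk, T, d).repeat_interleave(Hq // Hk, dim=1)  # share K heads
    v = torch.randn(B, Hv, T, d).repeat_interleave(Hq // Hv, dim=1)  # share V heads

    att = F.softmax(q @ k.transpose(-2, -1) / d ** 0.5, dim=-1)
    out = (att @ v).transpose(1, 2).reshape(B, T, D)
    print(out.shape)  # the KV cache only needs Hk + Hv head sets, not 2 * Hq
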
arXiv:2501.13468 (https://arxiv.org/abs/2501.13468) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Streaming Video Understanding and Multi-round Interaction with Memory-enhanced Knowledge
Authors: Haomiao Xiong, Zongxin Yang, Jiazuo Yu, Yunzhi Zhuge, Lu Zhang, Jiawen Zhu, Huchuan Lu
Abstract: Recent advances in Large Language Models (LLMs) have enabled the development of Video-LLMs, advancing multimodal learning by bridging video data with language tasks. However, current video understanding models struggle with processing long video sequences, supporting multi-turn dialogues, and adapting to real-world dynamic scenarios. To address these issues, we propose StreamChat, a training-free framework for streaming video reasoning and conversational interaction. StreamChat leverages a novel hierarchical memory system to efficiently process and compress video features over extended sequences, enabling real-time, multi-turn dialogue. Our framework incorporates a parallel system scheduling strategy that enhances processing speed and reduces latency, ensuring robust performance in real-world applications. Furthermore, we introduce StreamBench, a versatile benchmark that evaluates streaming video understanding across diverse media types and interactive scenarios, including multi-turn interactions and complex reasoning tasks. Extensive evaluations on StreamBench and other public benchmarks demonstrate that StreamChat significantly outperforms existing state-of-the-art models in terms of accuracy and response times, confirming its effectiveness for streaming video understanding.
Code is available at https://github.com/hmxiong/StreamChat.
Submitted 23 January, 2025; originally announced January 2025.
Comments: Accepted to ICLR 2025. Code is available at https://github.com/hmxiong/StreamChat

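A hierarchical memory for streaming features can be caricatured as an exact short-term window plus mean-pooled long-term slots. This is purely a sketch of the general pattern, not StreamChat's actual memory system:

    from collections import deque
    import numpy as np

    class HierarchicalMemory:
        # Recent frames are kept exactly; evicted frames are merged
        # (mean-pooled) into coarse long-term slots so context stays bounded.
        def __init__(self, recent=8, pool=4):
            self.recent = deque(maxlen=recent)   # fine-grained short-term store
            self.long_term, self.buf, self.pool = [], [], pool

        def add(self, feat):
            if len(self.recent) == self.recent.maxlen:
                self.buf.append(self.recent[0])  # about to be evicted
            self.recent.append(feat)
            if len(self.buf) == self.pool:       # compress a pool into one slot
                self.long_term.append(np.mean(self.buf, axis=0))
                self.buf = []

    mem = HierarchicalMemory()
    for _ in range(40):
        mem.add(np.random.randn(16))
    print(len(mem.recent), len(mem.long_term))   # 8 recent frames, 8 coarse slots
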
arXiv:2501.12935 (https://arxiv.org/abs/2501.12935) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: 3D Object Manipulation in a Single Image using Generative Models
Authors: Ruisi Zhao, Zechuan Zhang, Zongxin Yang, Yi Yang
Abstract: Object manipulation in images aims not only to edit the object's presentation but also to endow objects with motion. Previous methods encountered challenges in concurrently handling static editing and dynamic generation, while also struggling to achieve fidelity in object appearance and scene lighting. In this work, we introduce OMG3D, a novel framework that integrates precise geometric control with the generative power of diffusion models, thus achieving significant enhancements in visual performance. Our framework first converts 2D objects into 3D, enabling user-directed modifications and lifelike motions at the geometric level. To address texture realism, we propose CustomRefiner, a texture refinement module that pre-trains a customized diffusion model to align the details and style of coarse renderings of the rough 3D model with the original image and further refine the texture. Additionally, we introduce IllumiCombiner, a lighting processing module that estimates and corrects background lighting to match human visual perception, resulting in more realistic shadow effects. Extensive experiments demonstrate the outstanding visual performance of our approach in both static and dynamic scenarios. Remarkably, all these steps can be done using one NVIDIA 3090. The project page is at https://whalesong-zrs.github.io/OMG3D-projectpage/
Submitted 22 January, 2025; originally announced January 2025.
href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>