CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,167 results for author: <span class="mathjax">Liu, L</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Liu%2C+L">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Liu, L"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Liu%2C+L&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Liu, L"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Liu%2C+L&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span> [<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K">Kaijun Tan</a>, <a href="/search/cs?searchtype=author&query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v1-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v1-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v1-abstract-full').style.display = 'none'; document.getElementById('2502.10248v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09785">arXiv:2502.09785</a> <span> [<a href="https://arxiv.org/pdf/2502.09785">pdf</a>, <a href="https://arxiv.org/format/2502.09785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Accelerator-assisted Floating-point ASIP for Communication and Positioning in Massive MIMO Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Attari%2C+M">Mohammad Attari</a>, <a href="/search/cs?searchtype=author&query=Edfors%2C+O">Ove Edfors</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09785v1-abstract-short" style="display: inline;"> This paper presents an implementation of a floating-point-capable application-specific instruction set processor (ASIP) for both communication and positioning tasks using the massive multiple-input multiple-output (MIMO) technology. The ASIP is geared with vector processing capabilities in the form of single instruction multiple data (SIMD). A dual-pronged accelerator composition assists the proce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09785v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09785v1-abstract-full" style="display: none;"> This paper presents an implementation of a floating-point-capable application-specific instruction set processor (ASIP) for both communication and positioning tasks using the massive multiple-input multiple-output (MIMO) technology. The ASIP is geared with vector processing capabilities in the form of single instruction multiple data (SIMD). A dual-pronged accelerator composition assists the processor to tame the heavier mathematical workloads. A standalone systolic array accelerator accompanies the processor to aid with matrix multiplications. A parallel vector memory subsystem provides functionalities to both the processor and the systolic array. Additionally, A convolutional neural network (CNN) module accelerator, which is paired with its own separate vector memory, works hand in glove with the processor to take on the positioning task. The processor is synthesized in 22 nm fully depleted silicon-on-insulator (FD-SOI) technology running at a clock frequency of 800 MHz. The system achieves a maximum detection throughput of 2.1 Gb/s in a 128x16 massive MIMO system for the user equipment (UE) speed of 50km/h. The localization throughput settles at around 390 positionings/s. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09785v1-abstract-full').style.display = 'none'; document.getElementById('2502.09785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09650">arXiv:2502.09650</a> <span> [<a href="https://arxiv.org/pdf/2502.09650">pdf</a>, <a href="https://arxiv.org/format/2502.09650">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Principled Data Selection for Alignment: The Hidden Risks of Difficult Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chengqian Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haonan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liu Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zeke Xie</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Peilin Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiqiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09650v1-abstract-short" style="display: inline;"> The alignment of large language models (LLMs) often assumes that using more clean data yields better outcomes, overlooking the match between model capacity and example difficulty. Challenging this, we propose a new principle: Preference data vary in difficulty, and overly difficult examples hinder alignment, by exceeding the model's capacity. Through systematic experimentation, we validate this pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09650v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09650v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09650v1-abstract-full" style="display: none;"> The alignment of large language models (LLMs) often assumes that using more clean data yields better outcomes, overlooking the match between model capacity and example difficulty. Challenging this, we propose a new principle: Preference data vary in difficulty, and overly difficult examples hinder alignment, by exceeding the model's capacity. Through systematic experimentation, we validate this principle with three key findings: (1) preference examples vary in difficulty, as evidenced by consistent learning orders across alignment runs; (2) overly difficult examples significantly degrade performance across four LLMs and two datasets; and (3) the capacity of a model dictates its threshold for handling difficult examples, underscoring a critical relationship between data selection and model capacity. Building on this principle, we introduce Selective DPO, which filters out overly difficult examples. This simple adjustment improves alignment performance by 9-16% in win rates on the AlpacaEval 2 benchmark compared to the DPO baseline, suppressing a series of DPO variants with different algorithmic adjustments. Together, these results illuminate the importance of aligning data difficulty with model capacity, offering a transformative perspective for improving alignment strategies in LLMs. Code is available at https://github.com/glorgao/SelectiveDPO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09650v1-abstract-full').style.display = 'none'; document.getElementById('2502.09650v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08946">arXiv:2502.08946</a> <span> [<a href="https://arxiv.org/pdf/2502.08946">pdf</a>, <a href="https://arxiv.org/format/2502.08946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Stochastic Parrot on LLM's Shoulder: A Summative Assessment of Physical Concept Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+M">Mo Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lemao Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junjie Wu</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+T+T">Tsz Ting Chung</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunchi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiangnan Li</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08946v1-abstract-short" style="display: inline;"> In a systematic way, we investigate a widely asked question: Do LLMs really understand what they say?, which relates to the more familiar term Stochastic Parrot. To this end, we propose a summative assessment over a carefully designed physical concept understanding task, PhysiCo. Our task alleviates the memorization issue via the usage of grid-format inputs that abstractly describe physical phenom… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08946v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08946v1-abstract-full" style="display: none;"> In a systematic way, we investigate a widely asked question: Do LLMs really understand what they say?, which relates to the more familiar term Stochastic Parrot. To this end, we propose a summative assessment over a carefully designed physical concept understanding task, PhysiCo. Our task alleviates the memorization issue via the usage of grid-format inputs that abstractly describe physical phenomena. The grids represents varying levels of understanding, from the core phenomenon, application examples to analogies to other abstract patterns in the grid world. A comprehensive study on our task demonstrates: (1) state-of-the-art LLMs, including GPT-4o, o1 and Gemini 2.0 flash thinking, lag behind humans by ~40%; (2) the stochastic parrot phenomenon is present in LLMs, as they fail on our grid task but can describe and recognize the same concepts well in natural language; (3) our task challenges the LLMs due to intrinsic difficulties rather than the unfamiliar grid format, as in-context learning and fine-tuning on same formatted data added little to their performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08946v1-abstract-full').style.display = 'none'; document.getElementById('2502.08946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2025 Main Conference. First 5 authors contributed equally. Project page: https://physico-benchmark.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08309">arXiv:2502.08309</a> <span> [<a href="https://arxiv.org/pdf/2502.08309">pdf</a>, <a href="https://arxiv.org/format/2502.08309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Unlocking Scaling Law in Industrial Recommendation Systems with a Three-step Paradigm based Large User Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+B">Bencheng Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilei Liu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yizhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yujin Yuan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Langming Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaqi Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Wenbo Su</a>, <a href="/search/cs?searchtype=author&query=Pengjie%2C+W">Wang Pengjie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08309v1-abstract-short" style="display: inline;"> Recent advancements in autoregressive Large Language Models (LLMs) have achieved significant milestones, largely attributed to their scalability, often referred to as the "scaling law". Inspired by these achievements, there has been a growing interest in adapting LLMs for Recommendation Systems (RecSys) by reformulating RecSys tasks into generative problems. However, these End-to-End Generative Re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08309v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08309v1-abstract-full" style="display: none;"> Recent advancements in autoregressive Large Language Models (LLMs) have achieved significant milestones, largely attributed to their scalability, often referred to as the "scaling law". Inspired by these achievements, there has been a growing interest in adapting LLMs for Recommendation Systems (RecSys) by reformulating RecSys tasks into generative problems. However, these End-to-End Generative Recommendation (E2E-GR) methods tend to prioritize idealized goals, often at the expense of the practical advantages offered by traditional Deep Learning based Recommendation Models (DLRMs) in terms of in features, architecture, and practices. This disparity between idealized goals and practical needs introduces several challenges and limitations, locking the scaling law in industrial RecSys. In this paper, we introduce a large user model (LUM) that addresses these limitations through a three-step paradigm, designed to meet the stringent requirements of industrial settings while unlocking the potential for scalable recommendations. Our extensive experimental evaluations demonstrate that LUM outperforms both state-of-the-art DLRMs and E2E-GR approaches. Notably, LUM exhibits excellent scalability, with performance improvements observed as the model scales up to 7 billion parameters. Additionally, we have successfully deployed LUM in an industrial application, where it achieved significant gains in an A/B test, further validating its effectiveness and practicality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08309v1-abstract-full').style.display = 'none'; document.getElementById('2502.08309v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08302">arXiv:2502.08302</a> <span> [<a href="https://arxiv.org/pdf/2502.08302">pdf</a>, <a href="https://arxiv.org/format/2502.08302">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HDT: Hierarchical Discrete Transformer for Multivariate Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shibo Feng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Peilin Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liu Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pengcheng Wu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhiqi Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08302v1-abstract-short" style="display: inline;"> Generative models have gained significant attention in multivariate time series forecasting (MTS), particularly due to their ability to generate high-fidelity samples. Forecasting the probability distribution of multivariate time series is a challenging yet practical task. Although some recent attempts have been made to handle this task, two major challenges persist: 1) some existing generative me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08302v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08302v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08302v1-abstract-full" style="display: none;"> Generative models have gained significant attention in multivariate time series forecasting (MTS), particularly due to their ability to generate high-fidelity samples. Forecasting the probability distribution of multivariate time series is a challenging yet practical task. Although some recent attempts have been made to handle this task, two major challenges persist: 1) some existing generative methods underperform in high-dimensional multivariate time series forecasting, which is hard to scale to higher dimensions; 2) the inherent high-dimensional multivariate attributes constrain the forecasting lengths of existing generative models. In this paper, we point out that discrete token representations can model high-dimensional MTS with faster inference time, and forecasting the target with long-term trends of itself can extend the forecasting length with high accuracy. Motivated by this, we propose a vector quantized framework called Hierarchical Discrete Transformer (HDT) that models time series into discrete token representations with l2 normalization enhanced vector quantized strategy, in which we transform the MTS forecasting into discrete tokens generation. To address the limitations of generative models in long-term forecasting, we propose a hierarchical discrete Transformer. This model captures the discrete long-term trend of the target at the low level and leverages this trend as a condition to generate the discrete representation of the target at the high level that introduces the features of the target itself to extend the forecasting length in high-dimensional MTS. Extensive experiments on five popular MTS datasets verify the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08302v1-abstract-full').style.display = 'none'; document.getElementById('2502.08302v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07190">arXiv:2502.07190</a> <span> [<a href="https://arxiv.org/pdf/2502.07190">pdf</a>, <a href="https://arxiv.org/format/2502.07190">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding LLMs' Fluid Intelligence Deficiency: An Analysis of the ARC Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junjie Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M">Mo Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lemao Liu</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07190v1-abstract-short" style="display: inline;"> While LLMs have exhibited strong performance on various NLP tasks, it is noteworthy that most of these tasks rely on utilizing the vast amount of knowledge encoded in LLMs' parameters, rather than solving new problems without prior knowledge. In cognitive research, the latter ability is referred to as fluid intelligence, which is considered to be critical for assessing human intelligence. Recent r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07190v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07190v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07190v1-abstract-full" style="display: none;"> While LLMs have exhibited strong performance on various NLP tasks, it is noteworthy that most of these tasks rely on utilizing the vast amount of knowledge encoded in LLMs' parameters, rather than solving new problems without prior knowledge. In cognitive research, the latter ability is referred to as fluid intelligence, which is considered to be critical for assessing human intelligence. Recent research on fluid intelligence assessments has highlighted significant deficiencies in LLMs' abilities. In this paper, we analyze the challenges LLMs face in demonstrating fluid intelligence through controlled experiments, using the most representative ARC task as an example. Our study revealed three major limitations in existing LLMs: limited ability for skill composition, unfamiliarity with abstract input formats, and the intrinsic deficiency of left-to-right decoding. Our data and code can be found in https://wujunjie1998.github.io/araoc-benchmark.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07190v1-abstract-full').style.display = 'none'; document.getElementById('2502.07190v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 9 figures, accepted by NAACL 2025 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06827">arXiv:2502.06827</a> <span> [<a href="https://arxiv.org/pdf/2502.06827">pdf</a>, <a href="https://arxiv.org/format/2502.06827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TNNLS.2022.3202842">10.1109/TNNLS.2022.3202842 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning to Synthesize Compatible Fashion Items Using Semantic Alignment and Collocation Classification: An Outfit Generation Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Dongliang Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haijun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Linlin Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Han Yan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaofei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shuicheng Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06827v1-abstract-short" style="display: inline;"> The field of fashion compatibility learning has attracted great attention from both the academic and industrial communities in recent years. Many studies have been carried out for fashion compatibility prediction, collocated outfit recommendation, artificial intelligence (AI)-enabled compatible fashion design, and related topics. In particular, AI-enabled compatible fashion design can be used to s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06827v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06827v1-abstract-full" style="display: none;"> The field of fashion compatibility learning has attracted great attention from both the academic and industrial communities in recent years. Many studies have been carried out for fashion compatibility prediction, collocated outfit recommendation, artificial intelligence (AI)-enabled compatible fashion design, and related topics. In particular, AI-enabled compatible fashion design can be used to synthesize compatible fashion items or outfits in order to improve the design experience for designers or the efficacy of recommendations for customers. However, previous generative models for collocated fashion synthesis have generally focused on the image-to-image translation between fashion items of upper and lower clothing. In this paper, we propose a novel outfit generation framework, i.e., OutfitGAN, with the aim of synthesizing a set of complementary items to compose an entire outfit, given one extant fashion item and reference masks of target synthesized items. OutfitGAN includes a semantic alignment module, which is responsible for characterizing the mapping correspondence between the existing fashion items and the synthesized ones, to improve the quality of the synthesized images, and a collocation classification module, which is used to improve the compatibility of a synthesized outfit. In order to evaluate the performance of our proposed models, we built a large-scale dataset consisting of 20,000 fashion outfits. Extensive experimental results on this dataset show that our OutfitGAN can synthesize photo-realistic outfits and outperform state-of-the-art methods in terms of similarity, authenticity and compatibility measurements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06827v1-abstract-full').style.display = 'none'; document.getElementById('2502.06827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted by IEEE TNNLS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06823">arXiv:2502.06823</a> <span> [<a href="https://arxiv.org/pdf/2502.06823">pdf</a>, <a href="https://arxiv.org/format/2502.06823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> CTR-Driven Advertising Image Generation with Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingye Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wei Feng</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhenbang Du</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weizhen Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanyin Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haohan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Linkai Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jinyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+J">Jingjing Lv</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Junjie Shen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhangang Lin</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jingping Shao</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Y">Yuanjie Shao</a>, <a href="/search/cs?searchtype=author&query=You%2C+X">Xinge You</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Changxin Gao</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+N">Nong Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06823v1-abstract-short" style="display: inline;"> In web data, advertising images are crucial for capturing user attention and improving advertising effectiveness. Most existing methods generate background for products primarily focus on the aesthetic quality, which may fail to achieve satisfactory online performance. To address this limitation, we explore the use of Multimodal Large Language Models (MLLMs) for generating advertising images by op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06823v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06823v1-abstract-full" style="display: none;"> In web data, advertising images are crucial for capturing user attention and improving advertising effectiveness. Most existing methods generate background for products primarily focus on the aesthetic quality, which may fail to achieve satisfactory online performance. To address this limitation, we explore the use of Multimodal Large Language Models (MLLMs) for generating advertising images by optimizing for Click-Through Rate (CTR) as the primary objective. Firstly, we build targeted pre-training tasks, and leverage a large-scale e-commerce multimodal dataset to equip MLLMs with initial capabilities for advertising image generation tasks. To further improve the CTR of generated images, we propose a novel reward model to fine-tune pre-trained MLLMs through Reinforcement Learning (RL), which can jointly utilize multimodal features and accurately reflect user click preferences. Meanwhile, a product-centric preference optimization strategy is developed to ensure that the generated background content aligns with the product characteristics after fine-tuning, enhancing the overall relevance and effectiveness of the advertising images. Extensive experiments have demonstrated that our method achieves state-of-the-art performance in both online and offline metrics. Our code and pre-trained models are publicly available at: https://github.com/Chenguoz/CAIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06823v1-abstract-full').style.display = 'none'; document.getElementById('2502.06823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WWW 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05849">arXiv:2502.05849</a> <span> [<a href="https://arxiv.org/pdf/2502.05849">pdf</a>, <a href="https://arxiv.org/format/2502.05849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Fact-or-Fair: A Checklist for Behavioral Testing of AI Models on Fairness-Related Queries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jen-tse Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yuhang Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Linqi Liu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Y">Yixin Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+M+R">Michael R. Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05849v1-abstract-short" style="display: inline;"> The generation of incorrect images, such as depictions of people of color in Nazi-era uniforms by Gemini, frustrated users and harmed Google's reputation, motivating us to investigate the relationship between accurately reflecting factuality and promoting diversity and equity. In this study, we focus on 19 real-world statistics collected from authoritative sources. Using these statistics, we devel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05849v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05849v1-abstract-full" style="display: none;"> The generation of incorrect images, such as depictions of people of color in Nazi-era uniforms by Gemini, frustrated users and harmed Google's reputation, motivating us to investigate the relationship between accurately reflecting factuality and promoting diversity and equity. In this study, we focus on 19 real-world statistics collected from authoritative sources. Using these statistics, we develop a checklist comprising objective and subjective queries to analyze behavior of large language models (LLMs) and text-to-image (T2I) models. Objective queries assess the models' ability to provide accurate world knowledge. In contrast, the design of subjective queries follows a key principle: statistical or experiential priors should not be overgeneralized to individuals, ensuring that models uphold diversity. These subjective queries are based on three common human cognitive errors that often result in social biases. We propose metrics to assess factuality and fairness, and formally prove the inherent trade-off between these two aspects. Results show that GPT-4o and DALL-E 3 perform notably well among six LLMs and four T2I models. Our code is publicly available at https://github.com/uclanlp/Fact-or-Fair. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05849v1-abstract-full').style.display = 'none'; document.getElementById('2502.05849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages of main text; 7 pages of appendices;</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05787">arXiv:2502.05787</a> <span> [<a href="https://arxiv.org/pdf/2502.05787">pdf</a>, <a href="https://arxiv.org/format/2502.05787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> TAP-CAM: A Tunable Approximate Matching Engine based on Ferroelectric Content Addressable Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ni%2C+C">Chenyu Ni</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sijie Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Che-Kai Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liu Liu</a>, <a href="/search/cs?searchtype=author&query=Imani%2C+M">Mohsen Imani</a>, <a href="/search/cs?searchtype=author&query=Kampfe%2C+T">Thomas Kampfe</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+K">Kai Ni</a>, <a href="/search/cs?searchtype=author&query=Niemier%2C+M">Michael Niemier</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X+S">Xiaobo Sharon Hu</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+C">Cheng Zhuo</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xunzhao Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05787v1-abstract-short" style="display: inline;"> Pattern search is crucial in numerous analytic applications for retrieving data entries akin to the query. Content Addressable Memories (CAMs), an in-memory computing fabric, directly compare input queries with stored entries through embedded comparison logic, facilitating fast parallel pattern search in memory. While conventional CAM designs offer exact match functionality, they are inadequate fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05787v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05787v1-abstract-full" style="display: none;"> Pattern search is crucial in numerous analytic applications for retrieving data entries akin to the query. Content Addressable Memories (CAMs), an in-memory computing fabric, directly compare input queries with stored entries through embedded comparison logic, facilitating fast parallel pattern search in memory. While conventional CAM designs offer exact match functionality, they are inadequate for meeting the approximate search needs of emerging data-intensive applications. Some recent CAM designs propose approximate matching functions, but they face limitations such as excessively large cell area or the inability to precisely control the degree of approximation. In this paper, we propose TAP-CAM, a novel ferroelectric field effect transistor (FeFET) based ternary CAM (TCAM) capable of both exact and tunable approximate matching. TAP-CAM employs a compact 2FeFET-2R cell structure as the entry storage unit, and similarities in Hamming distances between input queries and stored entries are measured using an evaluation transistor associated with the matchline of CAM array. The operation, robustness and performance of the proposed design at array level have been discussed and evaluated, respectively. We conduct a case study of K-nearest neighbor (KNN) search to benchmark the proposed TAP-CAM at application level. Results demonstrate that compared to 16T CMOS CAM with exact match functionality, TAP-CAM achieves a 16.95x energy improvement, along with a 3.06% accuracy enhancement. Compared to 2FeFET TCAM with approximate match functionality, TAP-CAM achieves a 6.78x energy improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05787v1-abstract-full').style.display = 'none'; document.getElementById('2502.05787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05498">arXiv:2502.05498</a> <span> [<a href="https://arxiv.org/pdf/2502.05498">pdf</a>, <a href="https://arxiv.org/format/2502.05498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Riemannian Manifold Learning for Stackelberg Games with Neural Flow Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Larkin Liu</a>, <a href="/search/cs?searchtype=author&query=Rasul%2C+K">Kashif Rasul</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+Y">Yutong Chao</a>, <a href="/search/cs?searchtype=author&query=Etesami%2C+J">Jalal Etesami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05498v1-abstract-short" style="display: inline;"> We present a novel framework for online learning in Stackelberg general-sum games, where two agents, the leader and follower, engage in sequential turn-based interactions. At the core of this approach is a learned diffeomorphism that maps the joint action space to a smooth Riemannian manifold, referred to as the Stackelberg manifold. This mapping, facilitated by neural normalizing flows, ensures t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05498v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05498v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05498v1-abstract-full" style="display: none;"> We present a novel framework for online learning in Stackelberg general-sum games, where two agents, the leader and follower, engage in sequential turn-based interactions. At the core of this approach is a learned diffeomorphism that maps the joint action space to a smooth Riemannian manifold, referred to as the Stackelberg manifold. This mapping, facilitated by neural normalizing flows, ensures the formation of tractable isoplanar subspaces, enabling efficient techniques for online learning. By assuming linearity between the agents' reward functions on the Stackelberg manifold, our construct allows the application of standard bandit algorithms. We then provide a rigorous theoretical basis for regret minimization on convex manifolds and establish finite-time bounds on simple regret for learning Stackelberg equilibria. This integration of manifold learning into game theory uncovers a previously unrecognized potential for neural normalizing flows as an effective tool for multi-agent learning. We present empirical results demonstrating the effectiveness of our approach compared to standard baselines, with applications spanning domains such as cybersecurity and economic supply chain optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05498v1-abstract-full').style.display = 'none'; document.getElementById('2502.05498v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Stackelberg games. Manifold learning. Online learning</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 91A10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.11 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04600">arXiv:2502.04600</a> <span> [<a href="https://arxiv.org/pdf/2502.04600">pdf</a>, <a href="https://arxiv.org/format/2502.04600">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Cooperative Payload Estimation by a Team of Mocobots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C+L">C. Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Elwin%2C+M+L">Matthew L. Elwin</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+R+A">Randy A. Freeman</a>, <a href="/search/cs?searchtype=author&query=Lynch%2C+K+M">Kevin M. Lynch</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04600v1-abstract-short" style="display: inline;"> Consider the following scenario: a human guides multiple mobile manipulators to grasp a common payload. For subsequent high-performance autonomous manipulation of the payload by the mobile manipulator team, or for collaborative manipulation with the human, the robots should be able to discover where the other robots are attached to the payload, as well as the payload's mass and inertial properties… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04600v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04600v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04600v1-abstract-full" style="display: none;"> Consider the following scenario: a human guides multiple mobile manipulators to grasp a common payload. For subsequent high-performance autonomous manipulation of the payload by the mobile manipulator team, or for collaborative manipulation with the human, the robots should be able to discover where the other robots are attached to the payload, as well as the payload's mass and inertial properties. In this paper, we describe a method for the robots to autonomously discover this information. The robots cooperatively manipulate the payload, and the twist, twist derivative, and wrench data at their grasp frames are used to estimate the transformation matrices between the grasp frames, the location of the payload's center of mass, and the payload's inertia matrix. The method is validated experimentally with a team of three mobile cobots, or mocobots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04600v1-abstract-full').style.display = 'none'; document.getElementById('2502.04600v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures. Submitted to IEEE Robotics and Automation Letters (RA-L)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04492">arXiv:2502.04492</a> <span> [<a href="https://arxiv.org/pdf/2502.04492">pdf</a>, <a href="https://arxiv.org/format/2502.04492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-Agent Reinforcement Learning with Focal Diversity Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tekin%2C+S+F">Selim Furkan Tekin</a>, <a href="/search/cs?searchtype=author&query=Ilhan%2C+F">Fatih Ilhan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tiansheng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sihao Hu</a>, <a href="/search/cs?searchtype=author&query=Yahn%2C+Z">Zachary Yahn</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04492v1-abstract-short" style="display: inline;"> The advancement of Large Language Models (LLMs) and their finetuning strategies has triggered the renewed interests in multi-agent reinforcement learning. In this paper, we introduce a focal diversity-optimized multi-agent reinforcement learning approach, coined as MARL-Focal, with three unique characteristics. First, we develop an agent-fusion framework for encouraging multiple LLM based agents t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04492v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04492v1-abstract-full" style="display: none;"> The advancement of Large Language Models (LLMs) and their finetuning strategies has triggered the renewed interests in multi-agent reinforcement learning. In this paper, we introduce a focal diversity-optimized multi-agent reinforcement learning approach, coined as MARL-Focal, with three unique characteristics. First, we develop an agent-fusion framework for encouraging multiple LLM based agents to collaborate in producing the final inference output for each LLM query. Second, we develop a focal-diversity optimized agent selection algorithm that can choose a small subset of the available agents based on how well they can complement one another to generate the query output. Finally, we design a conflict-resolution method to detect output inconsistency among multiple agents and produce our MARL-Focal output through reward-aware and policy-adaptive inference fusion. Extensive evaluations on five benchmarks show that MARL-Focal is cost-efficient and adversarial-robust. Our multi-agent fusion model achieves performance improvement of 5.51\% compared to the best individual LLM-agent and offers stronger robustness over the TruthfulQA benchmark. Code is available at https://github.com/sftekin/rl-focal <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04492v1-abstract-full').style.display = 'none'; document.getElementById('2502.04492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03799">arXiv:2502.03799</a> <span> [<a href="https://arxiv.org/pdf/2502.03799">pdf</a>, <a href="https://arxiv.org/format/2502.03799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Hallucination Detection through Noise Injection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Litian Liu</a>, <a href="/search/cs?searchtype=author&query=Pourreza%2C+R">Reza Pourreza</a>, <a href="/search/cs?searchtype=author&query=Panchal%2C+S">Sunny Panchal</a>, <a href="/search/cs?searchtype=author&query=Bhattacharyya%2C+A">Apratim Bhattacharyya</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yao Qin</a>, <a href="/search/cs?searchtype=author&query=Memisevic%2C+R">Roland Memisevic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03799v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) are prone to generating plausible yet incorrect responses, known as hallucinations. Effectively detecting hallucinations is therefore crucial for the safe deployment of LLMs. Recent research has linked hallucinations to model uncertainty, suggesting that hallucinations can be detected by measuring dispersion over answer distributions obtained from a set of samples draw… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03799v2-abstract-full').style.display = 'inline'; document.getElementById('2502.03799v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03799v2-abstract-full" style="display: none;"> Large Language Models (LLMs) are prone to generating plausible yet incorrect responses, known as hallucinations. Effectively detecting hallucinations is therefore crucial for the safe deployment of LLMs. Recent research has linked hallucinations to model uncertainty, suggesting that hallucinations can be detected by measuring dispersion over answer distributions obtained from a set of samples drawn from a model. While drawing from the distribution over tokens defined by the model is a natural way to obtain samples, in this work, we argue that it is sub-optimal for the purpose of detecting hallucinations. We show that detection can be improved significantly by taking into account model uncertainty in the Bayesian sense. To this end, we propose a very simple and efficient approach that perturbs an appropriate subset of model parameters, or equivalently hidden unit activations, during sampling. We demonstrate its effectiveness across a wide range of datasets and model architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03799v2-abstract-full').style.display = 'none'; document.getElementById('2502.03799v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02607">arXiv:2502.02607</a> <span> [<a href="https://arxiv.org/pdf/2502.02607">pdf</a>, <a href="https://arxiv.org/format/2502.02607">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MIND: Microstructure INverse Design with Generative Hybrid Neural Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xue%2C+T">Tianyang Xue</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haochen Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longdu Liu</a>, <a href="/search/cs?searchtype=author&query=Henderson%2C+P">Paul Henderson</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+P">Pengbin Tang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Lin Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jikai Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haisen Zhao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Bickel%2C+B">Bernd Bickel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02607v1-abstract-short" style="display: inline;"> The inverse design of microstructures plays a pivotal role in optimizing metamaterials with specific, targeted physical properties. While traditional forward design methods are constrained by their inability to explore the vast combinatorial design space, inverse design offers a compelling alternative by directly generating structures that fulfill predefined performance criteria. However, achievin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02607v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02607v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02607v1-abstract-full" style="display: none;"> The inverse design of microstructures plays a pivotal role in optimizing metamaterials with specific, targeted physical properties. While traditional forward design methods are constrained by their inability to explore the vast combinatorial design space, inverse design offers a compelling alternative by directly generating structures that fulfill predefined performance criteria. However, achieving precise control over both geometry and material properties remains a significant challenge due to their intricate interdependence. Existing approaches, which typically rely on voxel or parametric representations, often limit design flexibility and structural diversity. In this work, we present a novel generative model that integrates latent diffusion with Holoplane, an advanced hybrid neural representation that simultaneously encodes both geometric and physical properties. This combination ensures superior alignment between geometry and properties. Our approach generalizes across multiple microstructure classes, enabling the generation of diverse, tileable microstructures with significantly improved property accuracy and enhanced control over geometric validity, surpassing the performance of existing methods. We introduce a multi-class dataset encompassing a variety of geometric morphologies, including truss, shell, tube, and plate structures, to train and validate our model. Experimental results demonstrate the model's ability to generate microstructures that meet target properties, maintain geometric validity, and integrate seamlessly into complex assemblies. Additionally, we explore the potential of our framework through the generation of new microstructures, cross-class interpolation, and the infilling of heterogeneous microstructures. The dataset and source code will be open-sourced upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02607v1-abstract-full').style.display = 'none'; document.getElementById('2502.02607v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.3.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02295">arXiv:2502.02295</a> <span> [<a href="https://arxiv.org/pdf/2502.02295">pdf</a>, <a href="https://arxiv.org/ps/2502.02295">ps</a>, <a href="https://arxiv.org/format/2502.02295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Intelligent Reflecting Surface Based Localization of Mixed Near-Field and Far-Field Targets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Weifeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Di%2C+B">Boya Di</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liang Liu</a>, <a href="/search/cs?searchtype=author&query=Eldar%2C+Y+C">Yonina C. Eldar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02295v1-abstract-short" style="display: inline;"> This paper considers an intelligent reflecting surface (IRS)-assisted bi-static localization architecture for the sixth-generation (6G) integrated sensing and communication (ISAC) network. The system consists of a transmit user, a receive base station (BS), an IRS, and multiple targets in either the far-field or near-field region of the IRS. In particular, we focus on the challenging scenario wher… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02295v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02295v1-abstract-full" style="display: none;"> This paper considers an intelligent reflecting surface (IRS)-assisted bi-static localization architecture for the sixth-generation (6G) integrated sensing and communication (ISAC) network. The system consists of a transmit user, a receive base station (BS), an IRS, and multiple targets in either the far-field or near-field region of the IRS. In particular, we focus on the challenging scenario where the line-of-sight (LOS) paths between targets and the BS are blocked, such that the emitted orthogonal frequency division multiplexing (OFDM) signals from the user reach the BS merely via the user-target-IRS-BS path. Based on the signals received by the BS, our goal is to localize the targets by estimating their relative positions to the IRS, instead of to the BS. We show that subspace-based methods, such as the multiple signal classification (MUSIC) algorithm, can be applied onto the BS's received signals to estimate the relative states from the targets to the IRS. To this end, we create a virtual signal via combining user-target-IRS-BS channels over various time slots. By applying MUSIC on such a virtual signal, we are able to detect the far-field targets and the near-field targets, and estimate the angle-of-arrivals (AOAs) and/or ranges from the targets to the IRS. Furthermore, we theoretically verify that the proposed method can perfectly estimate the relative states from the targets to the IRS in the ideal case with infinite coherence blocks. Numerical results verify the effectiveness of our proposed IRS-assisted localization scheme. Our paper demonstrates the potential of employing passive anchors, i.e., IRSs, to improve the sensing coverage of the active anchors, i.e., BSs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02295v1-abstract-full').style.display = 'none'; document.getElementById('2502.02295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01152">arXiv:2502.01152</a> <span> [<a href="https://arxiv.org/pdf/2502.01152">pdf</a>, <a href="https://arxiv.org/format/2502.01152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Gradient Norm-based Fine-Tuning for Backdoor Defense in Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+N">Nanjun Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weilin Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01152v1-abstract-short" style="display: inline;"> Backdoor attacks have posed a significant threat to the security of deep neural networks (DNNs). Despite considerable strides in developing defenses against backdoor attacks in the visual domain, the specialized defenses for the audio domain remain empty. Furthermore, the defenses adapted from the visual to audio domain demonstrate limited effectiveness. To fill this gap, we propose Gradient Norm-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01152v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01152v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01152v1-abstract-full" style="display: none;"> Backdoor attacks have posed a significant threat to the security of deep neural networks (DNNs). Despite considerable strides in developing defenses against backdoor attacks in the visual domain, the specialized defenses for the audio domain remain empty. Furthermore, the defenses adapted from the visual to audio domain demonstrate limited effectiveness. To fill this gap, we propose Gradient Norm-based FineTuning (GN-FT), a novel defense strategy against the attacks in the audio domain, based on the observation from the corresponding backdoored models. Specifically, we first empirically find that the backdoored neurons exhibit greater gradient values compared to other neurons, while clean neurons stay the lowest. On this basis, we fine-tune the backdoored model by incorporating the gradient norm regularization, aiming to weaken and reduce the backdoored neurons. We further approximate the loss computation for lower implementation costs. Extensive experiments on two speech recognition datasets across five models demonstrate the superior performance of our proposed method. To the best of our knowledge, this work is the first specialized and effective defense against backdoor attacks in the audio domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01152v1-abstract-full').style.display = 'none'; document.getElementById('2502.01152v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures. This work has been accpeted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00843">arXiv:2502.00843</a> <span> [<a href="https://arxiv.org/pdf/2502.00843">pdf</a>, <a href="https://arxiv.org/format/2502.00843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VLM-Assisted Continual learning for Visual Question Answering in Self-Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuxin Lin</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+M">Mengshi Qi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liang Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Huadong Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00843v1-abstract-short" style="display: inline;"> In this paper, we propose a novel approach for solving the Visual Question Answering (VQA) task in autonomous driving by integrating Vision-Language Models (VLMs) with continual learning. In autonomous driving, VQA plays a vital role in enabling the system to understand and reason about its surroundings. However, traditional models often struggle with catastrophic forgetting when sequentially expo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00843v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00843v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00843v1-abstract-full" style="display: none;"> In this paper, we propose a novel approach for solving the Visual Question Answering (VQA) task in autonomous driving by integrating Vision-Language Models (VLMs) with continual learning. In autonomous driving, VQA plays a vital role in enabling the system to understand and reason about its surroundings. However, traditional models often struggle with catastrophic forgetting when sequentially exposed to new driving tasks, such as perception, prediction, and planning, each requiring different forms of knowledge. To address this challenge, we present a novel continual learning framework that combines VLMs with selective memory replay and knowledge distillation, reinforced by task-specific projection layer regularization. The knowledge distillation allows a previously trained model to act as a "teacher" to guide the model through subsequent tasks, minimizing forgetting. Meanwhile, task-specific projection layers calculate the loss based on the divergence of feature representations, ensuring continuity in learning and reducing the shift between tasks. Evaluated on the DriveLM dataset, our framework shows substantial performance improvements, with gains ranging from 21.40% to 32.28% across various metrics. These results highlight the effectiveness of combining continual learning with VLMs in enhancing the resilience and reliability of VQA systems in autonomous driving. We will release our source code. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00843v1-abstract-full').style.display = 'none'; document.getElementById('2502.00843v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.19243">arXiv:2501.19243</a> <span> [<a href="https://arxiv.org/pdf/2501.19243">pdf</a>, <a href="https://arxiv.org/format/2501.19243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Diffusion Transformer via Error-Optimized Cache </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+J">Junxiang Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jinda Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Houcheng Jiang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yanbin Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.19243v1-abstract-short" style="display: inline;"> Diffusion Transformer (DiT) is a crucial method for content generation. However, it needs a lot of time to sample. Many studies have attempted to use caching to reduce the time consumption of sampling. Existing caching methods accelerate generation by reusing DiT features from the previous time step and skipping calculations in the next, but they tend to locate and cache low-error modules without… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19243v1-abstract-full').style.display = 'inline'; document.getElementById('2501.19243v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.19243v1-abstract-full" style="display: none;"> Diffusion Transformer (DiT) is a crucial method for content generation. However, it needs a lot of time to sample. Many studies have attempted to use caching to reduce the time consumption of sampling. Existing caching methods accelerate generation by reusing DiT features from the previous time step and skipping calculations in the next, but they tend to locate and cache low-error modules without focusing on reducing caching-induced errors, resulting in a sharp decline in generated content quality when increasing caching intensity. To solve this problem, we propose the Error-Optimized Cache (EOC). This method introduces three key improvements: (1) Prior knowledge extraction: Extract and process the caching differences; (2) A judgment method for cache optimization: Determine whether certain caching steps need to be optimized; (3) Cache optimization: reduce caching errors. Experiments show that this algorithm significantly reduces the error accumulation caused by caching (especially over-caching). On the ImageNet dataset, without significantly increasing the computational burden, this method improves the quality of the generated images under the over-caching, rule-based, and training-based methods. Specifically, the Fr茅chet Inception Distance (FID) values are improved as follows: from 6.857 to 5.821, from 3.870 to 3.692 and form 3.539 to 3.451 respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19243v1-abstract-full').style.display = 'none'; document.getElementById('2501.19243v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18122">arXiv:2501.18122</a> <span> [<a href="https://arxiv.org/pdf/2501.18122">pdf</a>, <a href="https://arxiv.org/format/2501.18122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VQLTI: Long-Term Tropical Cyclone Intensity Forecasting with Physical Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyu Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kang Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Tao Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18122v1-abstract-short" style="display: inline;"> Tropical cyclone (TC) intensity forecasting is crucial for early disaster warning and emergency decision-making. Numerous researchers have explored deep-learning methods to address computational and post-processing issues in operational forecasting. Regrettably, they exhibit subpar long-term forecasting capabilities. We use two strategies to enhance long-term forecasting. (1) By enhancing the matc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18122v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18122v1-abstract-full" style="display: none;"> Tropical cyclone (TC) intensity forecasting is crucial for early disaster warning and emergency decision-making. Numerous researchers have explored deep-learning methods to address computational and post-processing issues in operational forecasting. Regrettably, they exhibit subpar long-term forecasting capabilities. We use two strategies to enhance long-term forecasting. (1) By enhancing the matching between TC intensity and spatial information, we can improve long-term forecasting performance. (2) Incorporating physical knowledge and physical constraints can help mitigate the accumulation of forecasting errors. To achieve the above strategies, we propose the VQLTI framework. VQLTI transfers the TC intensity information to a discrete latent space while retaining the spatial information differences, using large-scale spatial meteorological data as conditions. Furthermore, we leverage the forecast from the weather prediction model FengWu to provide additional physical knowledge for VQLTI. Additionally, we calculate the potential intensity (PI) to impose physical constraints on the latent variables. In the global long-term TC intensity forecasting, VQLTI achieves state-of-the-art results for the 24h to 120h, with the MSW (Maximum Sustained Wind) forecast error reduced by 35.65%-42.51% compared to ECMWF-IFS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18122v1-abstract-full').style.display = 'none'; document.getElementById('2501.18122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17433">arXiv:2501.17433</a> <span> [<a href="https://arxiv.org/pdf/2501.17433">pdf</a>, <a href="https://arxiv.org/format/2501.17433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Virus: Harmful Fine-tuning Attack for Large Language Models Bypassing Guardrail Moderation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tiansheng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sihao Hu</a>, <a href="/search/cs?searchtype=author&query=Ilhan%2C+F">Fatih Ilhan</a>, <a href="/search/cs?searchtype=author&query=Tekin%2C+S+F">Selim Furkan Tekin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17433v1-abstract-short" style="display: inline;"> Recent research shows that Large Language Models (LLMs) are vulnerable to harmful fine-tuning attacks -- models lose their safety alignment ability after fine-tuning on a few harmful samples. For risk mitigation, a guardrail is typically used to filter out harmful samples before fine-tuning. By designing a new red-teaming method, we in this paper show that purely relying on the moderation guardrai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17433v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17433v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17433v1-abstract-full" style="display: none;"> Recent research shows that Large Language Models (LLMs) are vulnerable to harmful fine-tuning attacks -- models lose their safety alignment ability after fine-tuning on a few harmful samples. For risk mitigation, a guardrail is typically used to filter out harmful samples before fine-tuning. By designing a new red-teaming method, we in this paper show that purely relying on the moderation guardrail for data filtration is not reliable. Our proposed attack method, dubbed Virus, easily bypasses the guardrail moderation by slightly modifying the harmful data. Experimental results show that the harmful data optimized by Virus is not detectable by the guardrail with up to 100\% leakage ratio, and can simultaneously achieve superior attack performance. Finally, the key message we want to convey through this paper is that: \textbf{it is reckless to consider guardrail moderation as a clutch at straws towards harmful fine-tuning attack}, as it cannot solve the inherent safety issue of the pre-trained LLMs. Our code is available at https://github.com/git-disl/Virus <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17433v1-abstract-full').style.display = 'none'; document.getElementById('2501.17433v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17167">arXiv:2501.17167</a> <span> [<a href="https://arxiv.org/pdf/2501.17167">pdf</a>, <a href="https://arxiv.org/format/2501.17167">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> QualityFlow: An Agentic Workflow for Program Synthesis Controlled by LLM Quality Checks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yaojie Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qihong Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaopeng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Linbo Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dejiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Kachroo%2C+A">Amit Kachroo</a>, <a href="/search/cs?searchtype=author&query=Oz%2C+T">Talha Oz</a>, <a href="/search/cs?searchtype=author&query=Tripp%2C+O">Omer Tripp</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17167v1-abstract-short" style="display: inline;"> We introduce QualityFlow, a dynamic agentic workflow for program synthesis. Given the English description of a programming problem and a set of unit tests, the model's goal is to synthesize the correct program that solves the problem and passes the tests. QualityFlow consists of multiple large language model (LLM) agents that resemble a software development team, including code generation, testing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17167v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17167v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17167v1-abstract-full" style="display: none;"> We introduce QualityFlow, a dynamic agentic workflow for program synthesis. Given the English description of a programming problem and a set of unit tests, the model's goal is to synthesize the correct program that solves the problem and passes the tests. QualityFlow consists of multiple large language model (LLM) agents that resemble a software development team, including code generation, testing, and self-debugging. Existing program synthesis methods face three major limitations: assumption of visible unit test conformity, bottleneck of synthesized test quality, and deviation of self-debugging trajectory. To address them, we propose the LLM Quality Checker, which explicitly "imagines" whether the synthesized programs' execution would conform to the unit tests. The Quality Checks dynamically control the workflow, including actions to submit the final answer, clarify the problem statement, and revert previous workflow steps. As a result, our Quality Checker can precisely accept any correct program, mitigate faulty synthesized tests, and prevent potential workflow deviation. The success of the Quality Checker further enables Diversified Prompting, which encourages variations in LLM responses to maximize the possibility that a correct program appears and passes the quality check. In experiments, QualityFlow establishes the state-of-the-art results on four program synthesis benchmarks: MBPP, HumanEval, and the stricter evaluations of both MBPP and HumanEval from EvalPlus. Our systematic analysis shows that the dynamic workflow controlled by LLM quality checks can outperform static workflows and single-attempt zero-shot synthesis. The Quality Checker is the center of our investigation, and we dissect its individual performance and integrated impact on the workflow accuracy, as well as other ablations experiments to justify our workflow design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17167v1-abstract-full').style.display = 'none'; document.getElementById('2501.17167v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16767">arXiv:2501.16767</a> <span> [<a href="https://arxiv.org/pdf/2501.16767">pdf</a>, <a href="https://arxiv.org/format/2501.16767">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Target-driven Self-Distillation for Partial Observed Trajectories Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Pengfei Zhu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+P">Peng Shu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+M">Mengshi Qi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liang Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Huadong Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16767v1-abstract-short" style="display: inline;"> Accurate prediction of future trajectories of traffic agents is essential for ensuring safe autonomous driving. However, partially observed trajectories can significantly degrade the performance of even state-of-the-art models. Previous approaches often rely on knowledge distillation to transfer features from fully observed trajectories to partially observed ones. This involves firstly training a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16767v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16767v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16767v1-abstract-full" style="display: none;"> Accurate prediction of future trajectories of traffic agents is essential for ensuring safe autonomous driving. However, partially observed trajectories can significantly degrade the performance of even state-of-the-art models. Previous approaches often rely on knowledge distillation to transfer features from fully observed trajectories to partially observed ones. This involves firstly training a fully observed model and then using a distillation process to create the final model. While effective, they require multi-stage training, making the training process very expensive. Moreover, knowledge distillation can lead to a performance degradation of the model. In this paper, we introduce a Target-driven Self-Distillation method (TSD) for motion forecasting. Our method leverages predicted accurate targets to guide the model in making predictions under partial observation conditions. By employing self-distillation, the model learns from the feature distributions of both fully observed and partially observed trajectories during a single end-to-end training process. This enhances the model's ability to predict motion accurately in both fully observed and partially observed scenarios. We evaluate our method on multiple datasets and state-of-the-art motion forecasting models. Extensive experimental results demonstrate that our approach achieves significant performance improvements in both settings. To facilitate further research, we will release our code and model checkpoints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16767v1-abstract-full').style.display = 'none'; document.getElementById('2501.16767v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16504">arXiv:2501.16504</a> <span> [<a href="https://arxiv.org/pdf/2501.16504">pdf</a>, <a href="https://arxiv.org/format/2501.16504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Digital Twin Enabled Site Specific Channel Precoding: Over the Air CIR Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haider%2C+M">Majumder Haider</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+I">Imtiaz Ahmed</a>, <a href="/search/cs?searchtype=author&query=Hassan%2C+Z">Zoheb Hassan</a>, <a href="/search/cs?searchtype=author&query=O%27Shea%2C+T+J">Timothy J. O'Shea</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lingjia Liu</a>, <a href="/search/cs?searchtype=author&query=Rawat%2C+D+B">Danda B. Rawat</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16504v1-abstract-short" style="display: inline;"> This paper investigates the significance of designing a reliable, intelligent, and true physical environment-aware precoding scheme by leveraging an accurately designed channel twin model to obtain realistic channel state information (CSI) for cellular communication systems. Specifically, we propose a fine-tuned multi-step channel twin design process that can render CSI very close to the CSI of th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16504v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16504v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16504v1-abstract-full" style="display: none;"> This paper investigates the significance of designing a reliable, intelligent, and true physical environment-aware precoding scheme by leveraging an accurately designed channel twin model to obtain realistic channel state information (CSI) for cellular communication systems. Specifically, we propose a fine-tuned multi-step channel twin design process that can render CSI very close to the CSI of the actual environment. After generating a precise CSI, we execute precoding using the obtained CSI at the transmitter end. We demonstrate a two-step parameters' tuning approach to design channel twin by ray tracing (RT) emulation, then further fine-tuning of CSI by employing an artificial intelligence (AI) based algorithm can significantly reduce the gap between actual CSI and the fine-tuned digital twin (DT) rendered CSI. The simulation results show the effectiveness of the proposed novel approach in designing a true physical environment-aware channel twin model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16504v1-abstract-full').style.display = 'none'; document.getElementById('2501.16504v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15925">arXiv:2501.15925</a> <span> [<a href="https://arxiv.org/pdf/2501.15925">pdf</a>, <a href="https://arxiv.org/format/2501.15925">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Efficient Distillation of Deep Spiking Neural Networks for Full-Range Timestep Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chengting Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiaochen Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gaoang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erping Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Aili Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15925v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) are emerging as a brain-inspired alternative to traditional Artificial Neural Networks (ANNs), prized for their potential energy efficiency on neuromorphic hardware. Despite this, SNNs often suffer from accuracy degradation compared to ANNs and face deployment challenges due to fixed inference timesteps, which require retraining for adjustments, limiting operational… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15925v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15925v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15925v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) are emerging as a brain-inspired alternative to traditional Artificial Neural Networks (ANNs), prized for their potential energy efficiency on neuromorphic hardware. Despite this, SNNs often suffer from accuracy degradation compared to ANNs and face deployment challenges due to fixed inference timesteps, which require retraining for adjustments, limiting operational flexibility. To address these issues, our work considers the spatio-temporal property inherent in SNNs, and proposes a novel distillation framework for deep SNNs that optimizes performance across full-range timesteps without specific retraining, enhancing both efficacy and deployment adaptability. We provide both theoretical analysis and empirical validations to illustrate that training guarantees the convergence of all implicit models across full-range timesteps. Experimental results on CIFAR-10, CIFAR-100, CIFAR10-DVS, and ImageNet demonstrate state-of-the-art performance among distillation-based SNNs training methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15925v1-abstract-full').style.display = 'none'; document.getElementById('2501.15925v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15907">arXiv:2501.15907</a> <span> [<a href="https://arxiv.org/pdf/2501.15907">pdf</a>, <a href="https://arxiv.org/format/2501.15907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Emilia: A Large-Scale, Extensive, Multilingual, and Diverse Dataset for Speech Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+H">Haorui He</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+Z">Zengqiang Shang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chaoren Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuyuan Li</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yicheng Gu</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+H">Hua Hua</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liwei Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chen Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaqi Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+P">Peiyang Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuancheng Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pengyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15907v1-abstract-short" style="display: inline;"> Recent advancements in speech generation have been driven by the large-scale training datasets. However, current models fall short of capturing the spontaneity and variability inherent in real-world human speech, due to their reliance on audiobook datasets limited to formal read-aloud speech styles. To bridge this gap, we introduce Emilia-Pipe, an open-source preprocessing pipeline to extract high… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15907v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15907v1-abstract-full" style="display: none;"> Recent advancements in speech generation have been driven by the large-scale training datasets. However, current models fall short of capturing the spontaneity and variability inherent in real-world human speech, due to their reliance on audiobook datasets limited to formal read-aloud speech styles. To bridge this gap, we introduce Emilia-Pipe, an open-source preprocessing pipeline to extract high-quality training data from valuable yet underexplored in-the-wild data that capture spontaneous human speech in real-world contexts. By leveraging Emilia-Pipe, we construct Emilia, the first multilingual speech generation dataset derived from in-the-wild speech data. This dataset comprises over 101k hours of speech across six languages: English, Chinese, German, French, Japanese, and Korean. Besides, we expand Emilia to Emilia-Large, a dataset exceeding 216k hours, making it the largest open-source speech generation dataset available. Extensive experiments demonstrate that Emilia significantly outperforms traditional audiobook datasets in generating spontaneous and human-like speech, showcasing superior performance in capturing diverse speaker timbre and speaking styles of real-world human speech. Furthermore, this work underscores the importance of scaling dataset size to advance speech generation research and validates the effectiveness of Emilia for both multilingual and crosslingual speech generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15907v1-abstract-full').style.display = 'none'; document.getElementById('2501.15907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended version of arXiv:2407.05361, submitted to TASLP, dataset is available at: https://huggingface.co/datasets/amphion/Emilia-Dataset</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15368">arXiv:2501.15368</a> <span> [<a href="https://arxiv.org/pdf/2501.15368">pdf</a>, <a href="https://arxiv.org/format/2501.15368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Baichuan-Omni-1.5 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yadong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Song Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianpeng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zehuan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lijun Liu</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+L">Lingfeng Ming</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+G">Guosheng Dong</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+D">Da Pan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chong Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuanbo Fang</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+D">Dongdong Kuang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mingrui Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenglin Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+B">Bowen Ding</a>, <a href="/search/cs?searchtype=author&query=Song%2C+W">Wei Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xu Li</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zheng Liang</a> , et al. (68 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15368v1-abstract-short" style="display: inline;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15368v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15368v1-abstract-full" style="display: none;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'none'; document.getElementById('2501.15368v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15279">arXiv:2501.15279</a> <span> [<a href="https://arxiv.org/pdf/2501.15279">pdf</a>, <a href="https://arxiv.org/format/2501.15279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Polynomial 2D Biharmonic Coordinates for High-order Cages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shibo Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ligang Liu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xiao-Ming Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15279v1-abstract-short" style="display: inline;"> We derive closed-form expressions of biharmonic coordinates for 2D high-order cages, enabling the transformation of the input polynomial curves into polynomial curves of any order. Central to our derivation is the use of the high-order boundary element method. We demonstrate the practicality and effectiveness of our method on various 2D deformations. In practice, users can easily manipulate the Be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15279v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15279v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15279v1-abstract-full" style="display: none;"> We derive closed-form expressions of biharmonic coordinates for 2D high-order cages, enabling the transformation of the input polynomial curves into polynomial curves of any order. Central to our derivation is the use of the high-order boundary element method. We demonstrate the practicality and effectiveness of our method on various 2D deformations. In practice, users can easily manipulate the Bezier control points to perform the desired intuitive deformation, as the biharmonic coordinates provide an enriched deformation space and encourage the alignment between the boundary cage and its interior geometry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15279v1-abstract-full').style.display = 'none'; document.getElementById('2501.15279v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15144">arXiv:2501.15144</a> <span> [<a href="https://arxiv.org/pdf/2501.15144">pdf</a>, <a href="https://arxiv.org/format/2501.15144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploring Primitive Visual Measurement Understanding and the Role of Output Format in Learning in Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yadav%2C+A">Ankit Yadav</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lingqiao Liu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuankai Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15144v1-abstract-short" style="display: inline;"> This work investigates the capabilities of current vision-language models (VLMs) in visual understanding and attribute measurement of primitive shapes using a benchmark focused on controlled 2D shape configurations with variations in spatial positioning, occlusion, rotation, size, and shape attributes such as type, quadrant, center-coordinates, rotation, occlusion status, and color as shown in Fig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15144v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15144v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15144v1-abstract-full" style="display: none;"> This work investigates the capabilities of current vision-language models (VLMs) in visual understanding and attribute measurement of primitive shapes using a benchmark focused on controlled 2D shape configurations with variations in spatial positioning, occlusion, rotation, size, and shape attributes such as type, quadrant, center-coordinates, rotation, occlusion status, and color as shown in Figure 1 and supplementary Figures S3-S81. We fine-tune state-of-the-art VLMs (2B-8B parameters) using Low-Rank Adaptation (LoRA) and validate them on multiple out-of-domain (OD) scenarios from our proposed benchmark. Our findings reveal that coherent sentence-based outputs outperform tuple formats, particularly in OD scenarios with large domain gaps. Additionally, we demonstrate that scaling numeric tokens during loss computation enhances numerical approximation capabilities, further improving performance on spatial and measurement tasks. These results highlight the importance of output format design, loss scaling strategies, and robust generalization techniques in enhancing the training and fine-tuning of VLMs, particularly for tasks requiring precise spatial approximations and strong OD generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15144v1-abstract-full').style.display = 'none'; document.getElementById('2501.15144v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13354">arXiv:2501.13354</a> <span> [<a href="https://arxiv.org/pdf/2501.13354">pdf</a>, <a href="https://arxiv.org/format/2501.13354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NUDT4MSTAR: A Large Dataset and Benchmark Towards Remote Sensing Object Recognition in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijie Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+X">Xuying Xiong</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Bowen Peng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yafei Song</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianpeng Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13354v2-abstract-short" style="display: inline;"> As an indispensable sensor for Remote sensing, Synthetic Aperture Radar (SAR) has a unique capability for all-day imaging. Nevertheless, in a data-driven era, the scarcity of large-scale datasets poses a significant bottleneck to advancing SAR automatic target recognition (ATR) technology. This paper introduces NUDT4MSTAR, a large-scale SAR dataset for remote sensing target recognition in the wild… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13354v2-abstract-full').style.display = 'inline'; document.getElementById('2501.13354v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13354v2-abstract-full" style="display: none;"> As an indispensable sensor for Remote sensing, Synthetic Aperture Radar (SAR) has a unique capability for all-day imaging. Nevertheless, in a data-driven era, the scarcity of large-scale datasets poses a significant bottleneck to advancing SAR automatic target recognition (ATR) technology. This paper introduces NUDT4MSTAR, a large-scale SAR dataset for remote sensing target recognition in the wild, including 40 vehicle target types and various imaging conditions across 5 realistic scenes. NUDT4MSTAR represents a significant leap forward in dataset scale, containing over 190,000 images-tenfold the size of its predecessors. We meticulously annotate each image with detailed target information and imaging conditions. Besides, data in both processed magnitude images and original complex formats are provided. Then, we construct a comprehensive benchmark consisting of 7 experiments with 15 recognition methods focusing on the stable and effective ATR issues. Besides, we conduct transfer learning experiments utilizing various models training on NUDT4MSTAR and apply them to three other target datasets, demonstrating its substantial potential for the broader field of ground objects ATR. Finally, we discuss this dataset's application value and ATR's significant challenges. To the best of our knowledge, this work marks the first-ever endeavor to create a large-scale dataset benchmark for fine-grained SAR recognition in the wild, featuring an extensive collection of exhaustively annotated vehicle images. We expect that the open source of NUDT4MSTAR will facilitate the development of SAR ATR and attract a wider community of researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13354v2-abstract-full').style.display = 'none'; document.getElementById('2501.13354v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 14 figures; NUDT4MSTAR: https://github.com/waterdisappear/NUDT4MSTAR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12326">arXiv:2501.12326</a> <span> [<a href="https://arxiv.org/pdf/2501.12326">pdf</a>, <a href="https://arxiv.org/format/2501.12326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> UI-TARS: Pioneering Automated GUI Interaction with Native Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yujia Qin</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yining Ye</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junjie Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoming Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shihao Liang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+S">Shizuo Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junda Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunxin Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shijue Huang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+W">Wanjun Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kuanye Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiale Yang</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yu Miao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Woyu Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xu Jiang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qianli Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingyu Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+X">Xiaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+K">Kai Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuang Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yaowei Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chaolin Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a> , et al. (10 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12326v1-abstract-short" style="display: inline;"> This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12326v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12326v1-abstract-full" style="display: none;"> This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks. Experiments demonstrate its superior performance: UI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating perception, grounding, and GUI task execution. Notably, in the OSWorld benchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15 steps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld, UI-TARS achieves 46.6, surpassing GPT-4o (34.5). UI-TARS incorporates several key innovations: (1) Enhanced Perception: leveraging a large-scale dataset of GUI screenshots for context-aware understanding of UI elements and precise captioning; (2) Unified Action Modeling, which standardizes actions into a unified space across platforms and achieves precise grounding and interaction through large-scale action traces; (3) System-2 Reasoning, which incorporates deliberate reasoning into multi-step decision making, involving multiple reasoning patterns such as task decomposition, reflection thinking, milestone recognition, etc. (4) Iterative Training with Reflective Online Traces, which addresses the data bottleneck by automatically collecting, filtering, and reflectively refining new interaction traces on hundreds of virtual machines. Through iterative training and reflection tuning, UI-TARS continuously learns from its mistakes and adapts to unforeseen situations with minimal human intervention. We also analyze the evolution path of GUI agents to guide the further development of this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12326v1-abstract-full').style.display = 'none'; document.getElementById('2501.12326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12135">arXiv:2501.12135</a> <span> [<a href="https://arxiv.org/pdf/2501.12135">pdf</a>, <a href="https://arxiv.org/ps/2501.12135">ps</a>, <a href="https://arxiv.org/format/2501.12135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Revisit the AWGN-goodness of Polar-like Lattices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Junjiang Yu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shanxiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+B">Baoming Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12135v2-abstract-short" style="display: inline;"> This paper aims to provide a comprehensive introduction to lattices constructed based on polar-like codes and demonstrate some of their key properties, such as AWGN goodness. We first present polar lattices directly from the perspective of their generator matrix. Next, we discuss their connection with the recently proposed PAC (polarization adjusted convolutional) lattices and analyze the structur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12135v2-abstract-full').style.display = 'inline'; document.getElementById('2501.12135v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12135v2-abstract-full" style="display: none;"> This paper aims to provide a comprehensive introduction to lattices constructed based on polar-like codes and demonstrate some of their key properties, such as AWGN goodness. We first present polar lattices directly from the perspective of their generator matrix. Next, we discuss their connection with the recently proposed PAC (polarization adjusted convolutional) lattices and analyze the structural advantages of PAC lattices, through which the AWGN-goodness of PAC lattices can be conveniently demonstrated. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12135v2-abstract-full').style.display = 'none'; document.getElementById('2501.12135v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11931">arXiv:2501.11931</a> <span> [<a href="https://arxiv.org/pdf/2501.11931">pdf</a>, <a href="https://arxiv.org/ps/2501.11931">ps</a>, <a href="https://arxiv.org/format/2501.11931">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Construction of Simultaneously Good Polar Codes and Polar Lattices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+R">Ruimin Yuan</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shanxiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+C">Cong Ling</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+B">Baoming Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11931v2-abstract-short" style="display: inline;"> In this work, we investigate the simultaneous goodness of polar codes and polar lattices. The simultaneous goodness of a lattice or a code means that it is optimal for both channel coding and source coding simultaneously. The existence of such kind of lattices was proven by using random lattice ensembles. Our work provides an explicit construction based on the polarization technique. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11931v2-abstract-full" style="display: none;"> In this work, we investigate the simultaneous goodness of polar codes and polar lattices. The simultaneous goodness of a lattice or a code means that it is optimal for both channel coding and source coding simultaneously. The existence of such kind of lattices was proven by using random lattice ensembles. Our work provides an explicit construction based on the polarization technique. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11931v2-abstract-full').style.display = 'none'; document.getElementById('2501.11931v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 3 figures, submitted to IEEE for publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11622">arXiv:2501.11622</a> <span> [<a href="https://arxiv.org/pdf/2501.11622">pdf</a>, <a href="https://arxiv.org/format/2501.11622">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Causal Learning for Heterogeneous Subgroups Based on Nonlinear Causal Kernel Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lu Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yang Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qiyu Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11622v3-abstract-short" style="display: inline;"> Due to the challenge posed by multi-source and heterogeneous data collected from diverse environments, causal relationships among features can exhibit variations influenced by different time spans, regions, or strategies. This diversity makes a single causal model inadequate for accurately representing complex causal relationships in all observational data, a crucial consideration in causal learni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11622v3-abstract-full').style.display = 'inline'; document.getElementById('2501.11622v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11622v3-abstract-full" style="display: none;"> Due to the challenge posed by multi-source and heterogeneous data collected from diverse environments, causal relationships among features can exhibit variations influenced by different time spans, regions, or strategies. This diversity makes a single causal model inadequate for accurately representing complex causal relationships in all observational data, a crucial consideration in causal learning. To address this challenge, the nonlinear Causal Kernel Clustering method is introduced for heterogeneous subgroup causal learning, highlighting variations in causal relationships across diverse subgroups. The main component for clustering heterogeneous subgroups lies in the construction of the $u$-centered sample mapping function with the property of unbiased estimation, which assesses the differences in potential nonlinear causal relationships in various samples and supported by causal identifiability theory. Experimental results indicate that the method performs well in identifying heterogeneous subgroups and enhancing causal learning, leading to a reduction in prediction error. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11622v3-abstract-full').style.display = 'none'; document.getElementById('2501.11622v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10917">arXiv:2501.10917</a> <span> [<a href="https://arxiv.org/pdf/2501.10917">pdf</a>, <a href="https://arxiv.org/format/2501.10917">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Decomposing and Fusing Intra- and Inter-Sensor Spatio-Temporal Signal for Multi-Sensor Wearable Human Activity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+H">Haoyu Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoxuan Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chunyuan Zheng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Haonan Yuan</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+G">Guorui Liao</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jun Liao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10917v1-abstract-short" style="display: inline;"> Wearable Human Activity Recognition (WHAR) is a prominent research area within ubiquitous computing. Multi-sensor synchronous measurement has proven to be more effective for WHAR than using a single sensor. However, existing WHAR methods use shared convolutional kernels for indiscriminate temporal feature extraction across each sensor variable, which fails to effectively capture spatio-temporal re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10917v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10917v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10917v1-abstract-full" style="display: none;"> Wearable Human Activity Recognition (WHAR) is a prominent research area within ubiquitous computing. Multi-sensor synchronous measurement has proven to be more effective for WHAR than using a single sensor. However, existing WHAR methods use shared convolutional kernels for indiscriminate temporal feature extraction across each sensor variable, which fails to effectively capture spatio-temporal relationships of intra-sensor and inter-sensor variables. We propose the DecomposeWHAR model consisting of a decomposition phase and a fusion phase to better model the relationships between modality variables. The decomposition creates high-dimensional representations of each intra-sensor variable through the improved Depth Separable Convolution to capture local temporal features while preserving their unique characteristics. The fusion phase begins by capturing relationships between intra-sensor variables and fusing their features at both the channel and variable levels. Long-range temporal dependencies are modeled using the State Space Model (SSM), and later cross-sensor interactions are dynamically captured through a self-attention mechanism, highlighting inter-sensor spatial correlations. Our model demonstrates superior performance on three widely used WHAR datasets, significantly outperforming state-of-the-art models while maintaining acceptable computational efficiency. Our codes and supplementary materials are available at https://github.com/Anakin2555/DecomposeWHAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10917v1-abstract-full').style.display = 'none'; document.getElementById('2501.10917v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10453">arXiv:2501.10453</a> <span> [<a href="https://arxiv.org/pdf/2501.10453">pdf</a>, <a href="https://arxiv.org/format/2501.10453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Uncovering Bias in Foundation Models: Impact, Testing, Harm, and Mitigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shuzhou Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuanghui Zhang</a>, <a href="/search/cs?searchtype=author&query=Heikkil%C3%A4%2C+J">Janne Heikkil盲</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10453v1-abstract-short" style="display: inline;"> Bias in Foundation Models (FMs) - trained on vast datasets spanning societal and historical knowledge - poses significant challenges for fairness and equity across fields such as healthcare, education, and finance. These biases, rooted in the overrepresentation of stereotypes and societal inequalities in training data, exacerbate real-world discrimination, reinforce harmful stereotypes, and erode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10453v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10453v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10453v1-abstract-full" style="display: none;"> Bias in Foundation Models (FMs) - trained on vast datasets spanning societal and historical knowledge - poses significant challenges for fairness and equity across fields such as healthcare, education, and finance. These biases, rooted in the overrepresentation of stereotypes and societal inequalities in training data, exacerbate real-world discrimination, reinforce harmful stereotypes, and erode trust in AI systems. To address this, we introduce Trident Probe Testing (TriProTesting), a systematic testing method that detects explicit and implicit biases using semantically designed probes. Here we show that FMs, including CLIP, ALIGN, BridgeTower, and OWLv2, demonstrate pervasive biases across single and mixed social attributes (gender, race, age, and occupation). Notably, we uncover mixed biases when social attributes are combined, such as gender x race, gender x age, and gender x occupation, revealing deeper layers of discrimination. We further propose Adaptive Logit Adjustment (AdaLogAdjustment), a post-processing technique that dynamically redistributes probability power to mitigate these biases effectively, achieving significant improvements in fairness without retraining models. These findings highlight the urgent need for ethical AI practices and interdisciplinary solutions to address biases not only at the model level but also in societal structures. Our work provides a scalable and interpretable solution that advances fairness in AI systems while offering practical insights for future research on fair AI technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10453v1-abstract-full').style.display = 'none'; document.getElementById('2501.10453v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09757">arXiv:2501.09757</a> <span> [<a href="https://arxiv.org/pdf/2501.09757">pdf</a>, <a href="https://arxiv.org/format/2501.09757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Distilling Multi-modal Large Language Models for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hegde%2C+D">Deepti Hegde</a>, <a href="/search/cs?searchtype=author&query=Yasarla%2C+R">Rajeev Yasarla</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Hong Cai</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Shizhong Han</a>, <a href="/search/cs?searchtype=author&query=Bhattacharyya%2C+A">Apratim Bhattacharyya</a>, <a href="/search/cs?searchtype=author&query=Mahajan%2C+S">Shweta Mahajan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Litian Liu</a>, <a href="/search/cs?searchtype=author&query=Garrepalli%2C+R">Risheek Garrepalli</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+V+M">Vishal M. Patel</a>, <a href="/search/cs?searchtype=author&query=Porikli%2C+F">Fatih Porikli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09757v1-abstract-short" style="display: inline;"> Autonomous driving demands safe motion planning, especially in critical "long-tail" scenarios. Recent end-to-end autonomous driving systems leverage large language models (LLMs) as planners to improve generalizability to rare events. However, using LLMs at test time introduces high computational costs. To address this, we propose DiMA, an end-to-end autonomous driving system that maintains the eff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09757v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09757v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09757v1-abstract-full" style="display: none;"> Autonomous driving demands safe motion planning, especially in critical "long-tail" scenarios. Recent end-to-end autonomous driving systems leverage large language models (LLMs) as planners to improve generalizability to rare events. However, using LLMs at test time introduces high computational costs. To address this, we propose DiMA, an end-to-end autonomous driving system that maintains the efficiency of an LLM-free (or vision-based) planner while leveraging the world knowledge of an LLM. DiMA distills the information from a multi-modal LLM to a vision-based end-to-end planner through a set of specially designed surrogate tasks. Under a joint training strategy, a scene encoder common to both networks produces structured representations that are semantically grounded as well as aligned to the final planning objective. Notably, the LLM is optional at inference, enabling robust planning without compromising on efficiency. Training with DiMA results in a 37% reduction in the L2 trajectory error and an 80% reduction in the collision rate of the vision-based planner, as well as a 44% trajectory error reduction in longtail scenarios. DiMA also achieves state-of-the-art performance on the nuScenes planning benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09757v1-abstract-full').style.display = 'none'; document.getElementById('2501.09757v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09274">arXiv:2501.09274</a> <span> [<a href="https://arxiv.org/pdf/2501.09274">pdf</a>, <a href="https://arxiv.org/format/2501.09274">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model is Secretly a Protein Sequence Optimizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yinkai Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiaxing He</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yuanqi Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaohui Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J+C">Jianan Canal Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li-Ping Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolin Xu</a>, <a href="/search/cs?searchtype=author&query=Hassoun%2C+S">Soha Hassoun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09274v2-abstract-short" style="display: inline;"> We consider the protein sequence engineering problem, which aims to find protein sequences with high fitness levels, starting from a given wild-type sequence. Directed evolution has been a dominating paradigm in this field which has an iterative process to generate variants and select via experimental feedback. We demonstrate large language models (LLMs), despite being trained on massive texts, ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09274v2-abstract-full').style.display = 'inline'; document.getElementById('2501.09274v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09274v2-abstract-full" style="display: none;"> We consider the protein sequence engineering problem, which aims to find protein sequences with high fitness levels, starting from a given wild-type sequence. Directed evolution has been a dominating paradigm in this field which has an iterative process to generate variants and select via experimental feedback. We demonstrate large language models (LLMs), despite being trained on massive texts, are secretly protein sequence optimizers. With a directed evolutionary method, LLM can perform protein engineering through Pareto and experiment-budget constrained optimization, demonstrating success on both synthetic and experimental fitness landscapes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09274v2-abstract-full').style.display = 'none'; document.getElementById('2501.09274v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08643">arXiv:2501.08643</a> <span> [<a href="https://arxiv.org/pdf/2501.08643">pdf</a>, <a href="https://arxiv.org/format/2501.08643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MonSter: Marry Monodepth to Stereo Unleashes Power </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junda Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longliang Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Gangwei Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xianqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaoxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yong Deng</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+J">Jinliang Zang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yurui Chen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zhipeng Cai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08643v1-abstract-short" style="display: inline;"> Stereo matching recovers depth from image correspondences. Existing methods struggle to handle ill-posed regions with limited matching cues, such as occlusions and textureless areas. To address this, we propose MonSter, a novel method that leverages the complementary strengths of monocular depth estimation and stereo matching. MonSter integrates monocular depth and stereo matching into a dual-bran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08643v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08643v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08643v1-abstract-full" style="display: none;"> Stereo matching recovers depth from image correspondences. Existing methods struggle to handle ill-posed regions with limited matching cues, such as occlusions and textureless areas. To address this, we propose MonSter, a novel method that leverages the complementary strengths of monocular depth estimation and stereo matching. MonSter integrates monocular depth and stereo matching into a dual-branch architecture to iteratively improve each other. Confidence-based guidance adaptively selects reliable stereo cues for monodepth scale-shift recovery. The refined monodepth is in turn guides stereo effectively at ill-posed regions. Such iterative mutual enhancement enables MonSter to evolve monodepth priors from coarse object-level structures to pixel-level geometry, fully unlocking the potential of stereo matching. As shown in Fig.1, MonSter ranks 1st across five most commonly used leaderboards -- SceneFlow, KITTI 2012, KITTI 2015, Middlebury, and ETH3D. Achieving up to 49.5% improvements (Bad 1.0 on ETH3D) over the previous best method. Comprehensive analysis verifies the effectiveness of MonSter in ill-posed regions. In terms of zero-shot generalization, MonSter significantly and consistently outperforms state-of-the-art across the board. The code is publicly available at: https://github.com/Junda24/MonSter. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08643v1-abstract-full').style.display = 'none'; document.getElementById('2501.08643v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08520">arXiv:2501.08520</a> <span> [<a href="https://arxiv.org/pdf/2501.08520">pdf</a>, <a href="https://arxiv.org/format/2501.08520">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Chance-Constrained Sampling-Based MPC for Collision Avoidance in Uncertain Dynamic Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mohamed%2C+I+S">Ihab S. Mohamed</a>, <a href="/search/cs?searchtype=author&query=Ali%2C+M">Mahmoud Ali</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lantao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08520v1-abstract-short" style="display: inline;"> Navigating safely in dynamic and uncertain environments is challenging due to uncertainties in perception and motion. This letter presents C2U-MPPI, a robust sampling-based Model Predictive Control (MPC) framework that addresses these challenges by leveraging the Unscented Model Predictive Path Integral (U-MPPI) control strategy with integrated probabilistic chance constraints, ensuring more relia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08520v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08520v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08520v1-abstract-full" style="display: none;"> Navigating safely in dynamic and uncertain environments is challenging due to uncertainties in perception and motion. This letter presents C2U-MPPI, a robust sampling-based Model Predictive Control (MPC) framework that addresses these challenges by leveraging the Unscented Model Predictive Path Integral (U-MPPI) control strategy with integrated probabilistic chance constraints, ensuring more reliable and efficient navigation under uncertainty. Unlike gradient-based MPC methods, our approach (i) avoids linearization of system dynamics and directly applies non-convex and nonlinear chance constraints, enabling more accurate and flexible optimization, and (ii) enhances computational efficiency by reformulating probabilistic constraints into a deterministic form and employing a layered dynamic obstacle representation, enabling real-time handling of multiple obstacles. Extensive experiments in simulated and real-world human-shared environments validate the effectiveness of our algorithm against baseline methods, showcasing its capability to generate feasible trajectories and control inputs that adhere to system dynamics and constraints in dynamic settings, enabled by unscented-based sampling strategy and risk-sensitive trajectory evaluation. A supplementary video is available at: https://youtu.be/FptAhvJlQm8 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08520v1-abstract-full').style.display = 'none'; document.getElementById('2501.08520v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has 8 pages, 2 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08109">arXiv:2501.08109</a> <span> [<a href="https://arxiv.org/pdf/2501.08109">pdf</a>, <a href="https://arxiv.org/format/2501.08109">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Data-driven inventory management for new products: A warm-start and adjusted Dyna-$Q$ approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qu%2C+X">Xinye Qu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longxiao Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenjie Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08109v2-abstract-short" style="display: inline;"> In this paper, we propose a novel reinforcement learning algorithm for inventory management of newly launched products with no or limited historical demand information. The algorithm follows the classic Dyna-$Q$ structure, balancing the model-based and model-free approaches, while accelerating the training process of Dyna-$Q$ and mitigating the model discrepancy generated by the model-based feedba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08109v2-abstract-full').style.display = 'inline'; document.getElementById('2501.08109v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08109v2-abstract-full" style="display: none;"> In this paper, we propose a novel reinforcement learning algorithm for inventory management of newly launched products with no or limited historical demand information. The algorithm follows the classic Dyna-$Q$ structure, balancing the model-based and model-free approaches, while accelerating the training process of Dyna-$Q$ and mitigating the model discrepancy generated by the model-based feedback. Warm-start information from the demand data of existing similar products can be incorporated into the algorithm to further stabilize the early-stage training and reduce the variance of the estimated optimal policy. Our approach is validated through a case study of bakery inventory management with real data. The adjusted Dyna-$Q$ shows up to a 23.7% reduction in average daily cost compared with $Q$-learning, and up to a 77.5% reduction in training time within the same horizon compared with classic Dyna-$Q$. By incorporating the warm-start information, it can be found that the adjusted Dyna-$Q$ has the lowest total cost, lowest variance in total cost, and relatively low shortage percentages among all the algorithms under a 30-day testing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08109v2-abstract-full').style.display = 'none'; document.getElementById('2501.08109v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08001">arXiv:2501.08001</a> <span> [<a href="https://arxiv.org/pdf/2501.08001">pdf</a>, <a href="https://arxiv.org/format/2501.08001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GDiffRetro: Retrosynthesis Prediction with Dual Graph Enhanced Molecular Representation and Diffusion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shengyin Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yuxiang Ren</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Weitao Du</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liwei Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuecang Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Ying Hu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chen Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08001v1-abstract-short" style="display: inline;"> Retrosynthesis prediction focuses on identifying reactants capable of synthesizing a target product. Typically, the retrosynthesis prediction involves two phases: Reaction Center Identification and Reactant Generation. However, we argue that most existing methods suffer from two limitations in the two phases: (i) Existing models do not adequately capture the ``face'' information in molecular graph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08001v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08001v1-abstract-full" style="display: none;"> Retrosynthesis prediction focuses on identifying reactants capable of synthesizing a target product. Typically, the retrosynthesis prediction involves two phases: Reaction Center Identification and Reactant Generation. However, we argue that most existing methods suffer from two limitations in the two phases: (i) Existing models do not adequately capture the ``face'' information in molecular graphs for the reaction center identification. (ii) Current approaches for the reactant generation predominantly use sequence generation in a 2D space, which lacks versatility in generating reasonable distributions for completed reactive groups and overlooks molecules' inherent 3D properties. To overcome the above limitations, we propose GDiffRetro. For the reaction center identification, GDiffRetro uniquely integrates the original graph with its corresponding dual graph to represent molecular structures, which helps guide the model to focus more on the faces in the graph. For the reactant generation, GDiffRetro employs a conditional diffusion model in 3D to further transform the obtained synthon into a complete reactant. Our experimental findings reveal that GDiffRetro outperforms state-of-the-art semi-template models across various evaluative metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08001v1-abstract-full').style.display = 'none'; document.getElementById('2501.08001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07870">arXiv:2501.07870</a> <span> [<a href="https://arxiv.org/pdf/2501.07870">pdf</a>, <a href="https://arxiv.org/format/2501.07870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Make-A-Character 2: Animatable 3D Character Generation From a Single Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yutong Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiahao Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianfang Li</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+T">Tangli Xue</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Longlong Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jianqiang Ren</a>, <a href="/search/cs?searchtype=author&query=Bo%2C+L">Liefeng Bo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07870v2-abstract-short" style="display: inline;"> This report introduces Make-A-Character 2, an advanced system for generating high-quality 3D characters from single portrait photographs, ideal for game development and digital human applications. Make-A-Character 2 builds upon its predecessor by incorporating several significant improvements for image-based head generation. We utilize the IC-Light method to correct non-ideal illumination in input… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07870v2-abstract-full').style.display = 'inline'; document.getElementById('2501.07870v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07870v2-abstract-full" style="display: none;"> This report introduces Make-A-Character 2, an advanced system for generating high-quality 3D characters from single portrait photographs, ideal for game development and digital human applications. Make-A-Character 2 builds upon its predecessor by incorporating several significant improvements for image-based head generation. We utilize the IC-Light method to correct non-ideal illumination in input photos and apply neural network-based color correction to harmonize skin tones between the photos and game engine renders. We also employ the Hierarchical Representation Network to capture high-frequency facial structures and conduct adaptive skeleton calibration for accurate and expressive facial animations. The entire image-to-3D-character generation process takes less than 2 minutes. Furthermore, we leverage transformer architecture to generate co-speech facial and gesture actions, enabling real-time conversation with the generated character. These technologies have been integrated into our conversational AI avatar products. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07870v2-abstract-full').style.display = 'none'; document.getElementById('2501.07870v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07563">arXiv:2501.07563</a> <span> [<a href="https://arxiv.org/pdf/2501.07563">pdf</a>, <a href="https://arxiv.org/format/2501.07563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Training-Free Motion-Guided Video Generation with Enhanced Temporal Consistency Using Motion Consistency Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Z">Zicheng Duan</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+D">Dong Gong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lingqiao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07563v1-abstract-short" style="display: inline;"> In this paper, we address the challenge of generating temporally consistent videos with motion guidance. While many existing methods depend on additional control modules or inference-time fine-tuning, recent studies suggest that effective motion guidance is achievable without altering the model architecture or requiring extra training. Such approaches offer promising compatibility with various vid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07563v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07563v1-abstract-full" style="display: none;"> In this paper, we address the challenge of generating temporally consistent videos with motion guidance. While many existing methods depend on additional control modules or inference-time fine-tuning, recent studies suggest that effective motion guidance is achievable without altering the model architecture or requiring extra training. Such approaches offer promising compatibility with various video generation foundation models. However, existing training-free methods often struggle to maintain consistent temporal coherence across frames or to follow guided motion accurately. In this work, we propose a simple yet effective solution that combines an initial-noise-based approach with a novel motion consistency loss, the latter being our key innovation. Specifically, we capture the inter-frame feature correlation patterns of intermediate features from a video diffusion model to represent the motion pattern of the reference video. We then design a motion consistency loss to maintain similar feature correlation patterns in the generated video, using the gradient of this loss in the latent space to guide the generation process for precise motion control. This approach improves temporal consistency across various motion control tasks while preserving the benefits of a training-free setup. Extensive experiments show that our method sets a new standard for efficient, temporally coherent video generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07563v1-abstract-full').style.display = 'none'; document.getElementById('2501.07563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://zhangxinyu-xyz.github.io/SimulateMotion.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07063">arXiv:2501.07063</a> <span> [<a href="https://arxiv.org/pdf/2501.07063">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Research on the Online Update Method for Retrieval-Augmented Generation (RAG) Model with Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yuxin Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lipeng Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xirui Tang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+N">Na Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zidong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07063v1-abstract-short" style="display: inline;"> In the contemporary context of rapid advancements in information technology and the exponential growth of data volume, language models are confronted with significant challenges in effectively navigating the dynamic and ever-evolving information landscape to update and adapt to novel knowledge in real time. In this work, an online update method is proposed, which is based on the existing Retrieval… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07063v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07063v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07063v1-abstract-full" style="display: none;"> In the contemporary context of rapid advancements in information technology and the exponential growth of data volume, language models are confronted with significant challenges in effectively navigating the dynamic and ever-evolving information landscape to update and adapt to novel knowledge in real time. In this work, an online update method is proposed, which is based on the existing Retrieval Enhanced Generation (RAG) model with multiple innovation mechanisms. Firstly, the dynamic memory is used to capture the emerging data samples, and then gradually integrate them into the core model through a tunable knowledge distillation strategy. At the same time, hierarchical indexing and multi-layer gating mechanism are introduced into the retrieval module to ensure that the retrieved content is more targeted and accurate. Finally, a multi-stage network structure is established for different types of inputs in the generation stage, and cross-attention matching and screening are carried out on the intermediate representations of each stage to ensure the effective integration and iterative update of new and old knowledge. Experimental results show that the proposed method is better than the existing mainstream comparison models in terms of knowledge retention and inference accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07063v1-abstract-full').style.display = 'none'; document.getElementById('2501.07063v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06842">arXiv:2501.06842</a> <span> [<a href="https://arxiv.org/pdf/2501.06842">pdf</a>, <a href="https://arxiv.org/format/2501.06842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SPAM: Spike-Aware Adam with Momentum Reset for Stable LLM Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tianjin Huang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Ziquan Zhu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+G">Gaojie Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06842v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated exceptional performance across diverse tasks, yet their training remains highly resource-intensive and susceptible to critical challenges such as training instability. A predominant source of this instability stems from gradient and loss spikes, which disrupt the learning process, often leading to costly interventions like checkpoint recovery and expe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06842v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06842v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated exceptional performance across diverse tasks, yet their training remains highly resource-intensive and susceptible to critical challenges such as training instability. A predominant source of this instability stems from gradient and loss spikes, which disrupt the learning process, often leading to costly interventions like checkpoint recovery and experiment restarts, further amplifying inefficiencies. This paper presents a comprehensive investigation into gradient spikes observed during LLM training, revealing their prevalence across multiple architectures and datasets. Our analysis shows that these spikes can be up to $1000\times$ larger than typical gradients, substantially deteriorating model performance. To address this issue, we propose Spike-Aware Adam with Momentum Reset SPAM, a novel optimizer designed to counteract gradient spikes through momentum reset and spike-aware gradient clipping. Extensive experiments, including both pre-training and fine-tuning, demonstrate that SPAM consistently surpasses Adam and its variants across various tasks, including (1) LLM pre-training from 60M to 1B, (2) 4-bit LLM pre-training,(3) reinforcement learning, and (4) Time Series Forecasting. Additionally, SPAM facilitates memory-efficient training by enabling sparse momentum, where only a subset of momentum terms are maintained and updated. When operating under memory constraints, SPAM outperforms state-of-the-art memory-efficient optimizers such as GaLore and Adam-Mini. Our work underscores the importance of mitigating gradient spikes in LLM training and introduces an effective optimization strategy that enhances both training stability and resource efficiency at scale. Code is available at https://github.com/TianjinYellow/SPAM-Optimizer.git <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06842v1-abstract-full').style.display = 'none'; document.getElementById('2501.06842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06647">arXiv:2501.06647</a> <span> [<a href="https://arxiv.org/pdf/2501.06647">pdf</a>, <a href="https://arxiv.org/format/2501.06647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.14778/3704965.3704980">10.14778/3704965.3704980 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> TUCKET: A Tensor Time Series Data Structure for Efficient and Accurate Factor Analysis over Time Ranges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+R">Ruizhong Qiu</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+J">Jun-Gi Jang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xiao Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lihui Liu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06647v1-abstract-short" style="display: inline;"> Tucker decomposition has been widely used in a variety of applications to obtain latent factors of tensor data. In these applications, a common need is to compute Tucker decomposition for a given time range. Furthermore, real-world tensor time series are typically evolving in the time dimension. Such needs call for a data structure that can efficiently and accurately support range queries of Tucke… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06647v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06647v1-abstract-full" style="display: none;"> Tucker decomposition has been widely used in a variety of applications to obtain latent factors of tensor data. In these applications, a common need is to compute Tucker decomposition for a given time range. Furthermore, real-world tensor time series are typically evolving in the time dimension. Such needs call for a data structure that can efficiently and accurately support range queries of Tucker decomposition and stream updates. Unfortunately, existing methods do not support either range queries or stream updates. This challenging problem has remained open for years prior to our work. To solve this challenging problem, we propose TUCKET, a data structure that can efficiently and accurately handle both range queries and stream updates. Our key idea is to design a new data structure that we call a stream segment tree by generalizing the segment tree, a data structure that was originally invented for computational geometry. For a range query of length $L$, our TUCKET can find $O(\log L)$ nodes (called the hit set) from the tree and efficiently stitch their preprocessed decompositions to answer the range query. We also propose an algorithm to optimally prune the hit set via an approximation of subtensor decomposition. For the $T$-th stream update, our TUCKET modifies only amortized $O(1)$ nodes and only $O(\log T)$ nodes in the worst case. Extensive evaluation demonstrates that our TUCKET consistently achieves the highest efficiency and accuracy across four large-scale datasets. Our TUCKET achieves at least 3 times lower latency and at least 1.4 times smaller reconstruction error than Zoom-Tucker on all datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06647v1-abstract-full').style.display = 'none'; document.getElementById('2501.06647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at VLDB 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06429">arXiv:2501.06429</a> <span> [<a href="https://arxiv.org/pdf/2501.06429">pdf</a>, <a href="https://arxiv.org/format/2501.06429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Reliable Imputed-Sample Assisted Vertical Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yaopei Zeng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shaoguo Liu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+H">Hongjian Dou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06429v1-abstract-short" style="display: inline;"> Vertical Federated Learning (VFL) is a well-known FL variant that enables multiple parties to collaboratively train a model without sharing their raw data. Existing VFL approaches focus on overlapping samples among different parties, while their performance is constrained by the limited number of these samples, leaving numerous non-overlapping samples unexplored. Some previous work has explored te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06429v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06429v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06429v1-abstract-full" style="display: none;"> Vertical Federated Learning (VFL) is a well-known FL variant that enables multiple parties to collaboratively train a model without sharing their raw data. Existing VFL approaches focus on overlapping samples among different parties, while their performance is constrained by the limited number of these samples, leaving numerous non-overlapping samples unexplored. Some previous work has explored techniques for imputing missing values in samples, but often without adequate attention to the quality of the imputed samples. To address this issue, we propose a Reliable Imputed-Sample Assisted (RISA) VFL framework to effectively exploit non-overlapping samples by selecting reliable imputed samples for training VFL models. Specifically, after imputing non-overlapping samples, we introduce evidence theory to estimate the uncertainty of imputed samples, and only samples with low uncertainty are selected. In this way, high-quality non-overlapping samples are utilized to improve VFL model. Experiments on two widely used datasets demonstrate the significant performance gains achieved by the RISA, especially with the limited overlapping samples, e.g., a 48% accuracy gain on CIFAR-10 with only 1% overlapping samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06429v1-abstract-full').style.display = 'none'; document.getElementById('2501.06429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05427">arXiv:2501.05427</a> <span> [<a href="https://arxiv.org/pdf/2501.05427">pdf</a>, <a href="https://arxiv.org/format/2501.05427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Zero-1-to-G: Taming Pretrained 2D Diffusion Model for Direct 3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xuyi Meng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+J">Jiahui Lei</a>, <a href="/search/cs?searchtype=author&query=Daniilidis%2C+K">Kostas Daniilidis</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiatao Gu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lingjie Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05427v1-abstract-short" style="display: inline;"> Recent advances in 2D image generation have achieved remarkable quality,largely driven by the capacity of diffusion models and the availability of large-scale datasets. However, direct 3D generation is still constrained by the scarcity and lower fidelity of 3D datasets. In this paper, we introduce Zero-1-to-G, a novel approach that addresses this problem by enabling direct single-view generation o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05427v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05427v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05427v1-abstract-full" style="display: none;"> Recent advances in 2D image generation have achieved remarkable quality,largely driven by the capacity of diffusion models and the availability of large-scale datasets. However, direct 3D generation is still constrained by the scarcity and lower fidelity of 3D datasets. In this paper, we introduce Zero-1-to-G, a novel approach that addresses this problem by enabling direct single-view generation on Gaussian splats using pretrained 2D diffusion models. Our key insight is that Gaussian splats, a 3D representation, can be decomposed into multi-view images encoding different attributes. This reframes the challenging task of direct 3D generation within a 2D diffusion framework, allowing us to leverage the rich priors of pretrained 2D diffusion models. To incorporate 3D awareness, we introduce cross-view and cross-attribute attention layers, which capture complex correlations and enforce 3D consistency across generated splats. This makes Zero-1-to-G the first direct image-to-3D generative model to effectively utilize pretrained 2D diffusion priors, enabling efficient training and improved generalization to unseen objects. Extensive experiments on both synthetic and in-the-wild datasets demonstrate superior performance in 3D object generation, offering a new approach to high-quality 3D generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05427v1-abstract-full').style.display = 'none'; document.getElementById('2501.05427v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Liu%2C+L&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Liu%2C+L&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>