Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,367 results for author: <span class="mathjax">Li, K</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Li%2C+K">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, K"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+K&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, K"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10248">arXiv:2502.10248</a> <span> [<a href="https://arxiv.org/pdf/2502.10248">pdf</a>, <a href="https://arxiv.org/format/2502.10248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqing Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haoyang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kun Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+N">Nan Duan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengming Yin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+R">Ranchen Ming</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoniu Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Deshan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K">Kaijun Tan</a>, 
<a href="/search/cs?searchtype=author&query=An%2C+K">Kang An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiling Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wen Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xin Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yanan Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Z">Zheng Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a> , et al. (90 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10248v1-abstract-short" style="display: inline;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10248v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10248v1-abstract-full" style="display: none;"> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at https://github.com/stepfun-ai/Step-Video-T2V. The online version can be accessed from https://yuewen.cn/videos as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10248v1-abstract-full').style.display = 'none'; document.getElementById('2502.10248v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09977">arXiv:2502.09977</a> <span> [<a href="https://arxiv.org/pdf/2502.09977">pdf</a>, <a href="https://arxiv.org/format/2502.09977">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs - No Silver Bullet for LC or RAG Routing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Kuan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liwen Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yong Jiang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+P">Pengjun Xie</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M">Minhao Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09977v1-abstract-short" style="display: inline;"> Effectively incorporating external knowledge into Large Language Models (LLMs) is crucial for enhancing their capabilities and addressing real-world needs. Retrieval-Augmented Generation (RAG) offers an effective method for achieving this by retrieving the most relevant fragments into LLMs. However, the advancements in context window size for LLMs offer an alternative approach, raising the questio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09977v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09977v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09977v1-abstract-full" style="display: none;"> Effectively incorporating external knowledge into Large Language Models (LLMs) is crucial for enhancing their capabilities and addressing real-world needs. Retrieval-Augmented Generation (RAG) offers an effective method for achieving this by retrieving the most relevant fragments into LLMs. However, the advancements in context window size for LLMs offer an alternative approach, raising the question of whether RAG remains necessary for effectively handling external knowledge. Several existing studies provide inconclusive comparisons between RAG and long-context (LC) LLMs, largely due to limitations in the benchmark designs. 
2. arXiv:2502.09977 [pdf, other]
   Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
   Title: LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs - No Silver Bullet for LC or RAG Routing
   Authors: Kuan Li, Liwen Zhang, Yong Jiang, Pengjun Xie, Fei Huang, Shuai Wang, Minhao Cheng
   Abstract: Effectively incorporating external knowledge into Large Language Models (LLMs) is crucial for enhancing their capabilities and addressing real-world needs. Retrieval-Augmented Generation (RAG) offers an effective method for achieving this by retrieving the most relevant fragments into LLMs. However, the advancements in context window size for LLMs offer an alternative approach, raising the question of whether RAG remains necessary for effectively handling external knowledge. Several existing studies provide inconclusive comparisons between RAG and long-context (LC) LLMs, largely due to limitations in the benchmark designs. In this paper, we present LaRA, a novel benchmark specifically designed to rigorously compare RAG and LC LLMs. LaRA encompasses 2,326 test cases across four practical QA task categories and three types of naturally occurring long texts. Through systematic evaluation of seven open-source and four proprietary LLMs, we find that the optimal choice between RAG and LC depends on a complex interplay of factors, including the model's parameter size, long-text capabilities, context length, task type, and the characteristics of the retrieved chunks. Our findings provide actionable guidelines for practitioners to effectively leverage both RAG and LC approaches in developing and deploying LLM applications. Our code and dataset are provided at https://github.com/likuanppd/LaRA.
   Submitted 14 February, 2025; originally announced February 2025.
   Comments: 22 pages

3. arXiv:2502.09172 [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.CE (Computational Engineering, Finance, and Science); q-fin.CP (Computational Finance); q-fin.TR (Trading and Market Microstructure)
   Title: LOB-Bench: Benchmarking Generative AI for Finance - an Application to Limit Order Book Data
   Authors: Peer Nagy, Sascha Frey, Kang Li, Bidipta Sarkar, Svitlana Vyetrenko, Stefan Zohren, Ani Calinescu, Jakob Foerster
   Abstract: While financial data presents one of the most challenging and interesting sequence modelling tasks due to high noise, heavy tails, and strategic interactions, progress in this area has been hindered by the lack of consensus on quantitative evaluation paradigms. To address this, we present LOB-Bench, a benchmark, implemented in Python, designed to evaluate the quality and realism of generative message-by-order data for limit order books (LOB) in the LOBSTER format. Our framework measures distributional differences in conditional and unconditional statistics between generated and real LOB data, supporting flexible multivariate statistical evaluation. The benchmark also includes commonly used LOB statistics such as spread, order book volumes, order imbalance, and message inter-arrival times, along with scores from a trained discriminator network. Lastly, LOB-Bench contains "market impact metrics", i.e. the cross-correlations and price response functions for specific events in the data. We benchmark generative autoregressive state-space models, a (C)GAN, as well as a parametric LOB model and find that the autoregressive GenAI approach beats traditional model classes.
   Submitted 13 February, 2025; originally announced February 2025.
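To make the kind of evaluation the LOB-Bench abstract describes more concrete, here is a minimal sketch that compares one LOB statistic (the spread) between real and generated samples via an L1 distance between normalized histograms. The histogram metric, the bin count, and the log-normal stand-in data are assumptions for illustration; they are not LOB-Bench's actual scoring functions.

```python
import numpy as np

def l1_histogram_distance(real: np.ndarray, gen: np.ndarray, bins: int = 50) -> float:
    """L1 distance between the empirical densities of two 1-D statistics."""
    lo, hi = min(real.min(), gen.min()), max(real.max(), gen.max())
    p, _ = np.histogram(real, bins=bins, range=(lo, hi), density=True)
    q, _ = np.histogram(gen, bins=bins, range=(lo, hi), density=True)
    bin_width = (hi - lo) / bins
    return float(np.abs(p - q).sum() * bin_width)  # 0 = identical, 2 = disjoint supports

rng = np.random.default_rng(0)
real_spread = rng.lognormal(mean=0.0, sigma=0.30, size=10_000)   # stand-in for real spreads
gen_spread = rng.lognormal(mean=0.05, sigma=0.35, size=10_000)   # stand-in for generated spreads
print(l1_histogram_distance(real_spread, gen_spread))
```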
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08975">arXiv:2502.08975</a> <span> [<a href="https://arxiv.org/pdf/2502.08975">pdf</a>, <a href="https://arxiv.org/format/2502.08975">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Small Molecule Drug Discovery Through Deep Learning:Progress, Challenges, and Opportunities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yida Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xiantao Cai</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wenbin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08975v1-abstract-short" style="display: inline;"> Due to their excellent drug-like and pharmacokinetic properties, small molecule drugs are widely used to treat various diseases, making them a critical component of drug discovery. In recent years, with the rapid development of deep learning (DL) techniques, DL-based small molecule drug discovery methods have achieved excellent performance in prediction accuracy, speed, and complex molecular relat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08975v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08975v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08975v1-abstract-full" style="display: none;"> Due to their excellent drug-like and pharmacokinetic properties, small molecule drugs are widely used to treat various diseases, making them a critical component of drug discovery. In recent years, with the rapid development of deep learning (DL) techniques, DL-based small molecule drug discovery methods have achieved excellent performance in prediction accuracy, speed, and complex molecular relationship modeling compared to traditional machine learning approaches. These advancements enhance drug screening efficiency and optimization, and they provide more precise and effective solutions for various drug discovery tasks. Contributing to this field's development, this paper aims to systematically summarize and generalize the recent key tasks and representative techniques in DL-based small molecule drug discovery in recent years. Specifically, we provide an overview of the major tasks in small molecule drug discovery and their interrelationships. Next, we analyze the six core tasks, summarizing the related methods, commonly used datasets, and technological development trends. Finally, we discuss key challenges, such as interpretability and out-of-distribution generalization, and offer our insights into future research directions for DL-assisted small molecule drug discovery. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08975v1-abstract-full').style.display = 'none'; document.getElementById('2502.08975v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 1 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08622">arXiv:2502.08622</a> <span> [<a href="https://arxiv.org/pdf/2502.08622">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Forecasting Drought Using Machine Learning in California </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+N+K">Nan K. Li</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+A">Angela Chang</a>, <a href="/search/cs?searchtype=author&query=Sherman%2C+D">David Sherman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08622v1-abstract-short" style="display: inline;"> Drought is a frequent and costly natural disaster in California, with major negative impacts on agricultural production and water resource availability, particularly groundwater. This study investigated the performance of applying different machine learning approaches to predicting the U.S. Drought Monitor classification in California. Four approaches were used: a convolutional neural network (CNN… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08622v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08622v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08622v1-abstract-full" style="display: none;"> Drought is a frequent and costly natural disaster in California, with major negative impacts on agricultural production and water resource availability, particularly groundwater. This study investigated the performance of applying different machine learning approaches to predicting the U.S. Drought Monitor classification in California. Four approaches were used: a convolutional neural network (CNN), random forest, XGBoost, and long short term memory (LSTM) recurrent neural network, and compared to a baseline persistence model. We evaluated the models' performance in predicting severe drought (USDM drought category D2 or higher) using a macro F1 binary classification metric. The LSTM model emerged as the top performer, followed by XGBoost, CNN, and random forest. Further evaluation of our results at the county level suggested that the LSTM model would perform best in counties with more consistent drought patterns and where severe drought was more common, and the LSTM model would perform worse where drought scores increased rapidly. 
Utilizing 30 weeks of historical data, the LSTM model successfully forecasted drought scores for a 12-week period with a Mean Absolute Error (MAE) of 0.33, equivalent to less than half a drought category on a scale of 0 to 5. Additionally, the LSTM achieved a macro F1 score of 0.9, indicating high accuracy in binary classification for severe drought conditions. Evaluation of different window and future horizon sizes in weeks suggested that at least 24 weeks of data would result in the best performance, with best performance for shorter horizon sizes, particularly less than eight weeks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08622v1-abstract-full').style.display = 'none'; document.getElementById('2502.08622v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07802">arXiv:2502.07802</a> <span> [<a href="https://arxiv.org/pdf/2502.07802">pdf</a>, <a href="https://arxiv.org/format/2502.07802">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Movie Weaver: Tuning-Free Multi-Concept Video Personalization with Anchored Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+F">Feng Liang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Haoyu Ma</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zecheng He</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+T">Tingbo Hou</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Ji Hou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kunpeng Li</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiaoliang Dai</a>, <a href="/search/cs?searchtype=author&query=Juefei-Xu%2C+F">Felix Juefei-Xu</a>, <a href="/search/cs?searchtype=author&query=Azadi%2C+S">Samaneh Azadi</a>, <a href="/search/cs?searchtype=author&query=Sinha%2C+A">Animesh Sinha</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peizhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Vajda%2C+P">Peter Vajda</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+D">Diana Marculescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07802v1-abstract-short" style="display: inline;"> Video personalization, which generates customized videos using reference images, has gained significant attention. However, prior methods typically focus on single-concept personalization, limiting broader applications that require multi-concept integration. 
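The drought-forecasting abstract above evaluates severe drought (USDM category D2 or higher) as a binary classification scored with macro F1. The sketch below shows that computation on made-up score arrays; the cutoff index used to binarize the 0-to-5 drought score is left as a parameter because the abstract does not spell out the score-to-category mapping.

```python
import numpy as np

def macro_f1_severe(true_score: np.ndarray, pred_score: np.ndarray, cutoff: int = 3) -> float:
    """Macro F1 over the binary 'severe drought' label (score >= cutoff)."""
    y_true = (true_score >= cutoff).astype(int)
    y_pred = (pred_score >= cutoff).astype(int)
    f1_per_class = []
    for cls in (0, 1):  # macro F1 averages the per-class F1 scores
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1_per_class.append(2 * precision * recall / (precision + recall) if precision + recall else 0.0)
    return float(np.mean(f1_per_class))

# Made-up weekly drought scores on the 0-5 scale mentioned in the abstract.
print(macro_f1_severe(np.array([0, 1, 3, 4, 2, 5]), np.array([0, 2, 3, 3, 3, 5])))
```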
6. arXiv:2502.07802 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics); cs.LG (Machine Learning)
   Title: Movie Weaver: Tuning-Free Multi-Concept Video Personalization with Anchored Prompts
   Authors: Feng Liang, Haoyu Ma, Zecheng He, Tingbo Hou, Ji Hou, Kunpeng Li, Xiaoliang Dai, Felix Juefei-Xu, Samaneh Azadi, Animesh Sinha, Peizhao Zhang, Peter Vajda, Diana Marculescu
   Abstract: Video personalization, which generates customized videos using reference images, has gained significant attention. However, prior methods typically focus on single-concept personalization, limiting broader applications that require multi-concept integration. Attempts to extend these models to multiple concepts often lead to identity blending, which results in composite characters with fused attributes from multiple sources. This challenge arises due to the lack of a mechanism to link each concept with its specific reference image. We address this with anchored prompts, which embed image anchors as unique tokens within text prompts, guiding accurate referencing during generation. Additionally, we introduce concept embeddings to encode the order of reference images. Our approach, Movie Weaver, seamlessly weaves multiple concepts, including face, body, and animal images, into one video, allowing flexible combinations in a single model. The evaluation shows that Movie Weaver outperforms existing methods for multi-concept video personalization in identity preservation and overall quality.
   Submitted 4 February, 2025; originally announced February 2025.
   Comments: Project page: https://jeff-liangf.github.io/projects/movieweaver/

7. arXiv:2502.07750 [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
   Title: PFedDST: Personalized Federated Learning with Decentralized Selection Training
   Authors: Mengchen Fan, Keren Li, Tianyun Zhang, Qing Tian, Baocheng Geng
   Abstract: Distributed Learning (DL) enables the training of machine learning models across multiple devices, yet it faces challenges like non-IID data distributions and device capability disparities, which can impede training efficiency. Communication bottlenecks further complicate traditional Federated Learning (FL) setups. To mitigate these issues, we introduce the Personalized Federated Learning with Decentralized Selection Training (PFedDST) framework. PFedDST enhances model training by allowing devices to strategically evaluate and select peers based on a comprehensive communication score. This score integrates loss, task similarity, and selection frequency, ensuring optimal peer connections. This selection strategy is tailored to increase local personalization and promote beneficial peer collaborations to strengthen the stability and efficiency of the training process. Our experiments demonstrate that PFedDST not only enhances model accuracy but also accelerates convergence. This approach outperforms state-of-the-art methods in handling data heterogeneity, delivering both faster and more effective training in diverse and decentralized systems.
   Submitted 11 February, 2025; originally announced February 2025.
8. arXiv:2502.06189 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: Multi-Level Decoupled Relational Distillation for Heterogeneous Architectures
   Authors: Yaoxin Yang, Peng Ye, Weihao Lin, Kangcong Li, Yan Wen, Jia Hao, Tao Chen
   Abstract: Heterogeneous distillation is an effective way to transfer knowledge from cross-architecture teacher models to student models. However, existing heterogeneous distillation methods do not take full advantage of the dark knowledge hidden in the teacher's output, limiting their performance. To this end, we propose a novel framework named Multi-Level Decoupled Relational Knowledge Distillation (MLDR-KD) to unleash the potential of relational distillation in heterogeneous distillation. Concretely, we first introduce Decoupled Finegrained Relation Alignment (DFRA) at both the logit and feature levels to balance the trade-off between distilled dark knowledge and the confidence in the correct category of the heterogeneous teacher model. Then, a Multi-Scale Dynamic Fusion (MSDF) module is applied to dynamically fuse the projected logits of multi-scale features at different stages of the student model, further improving the performance of our method at the feature level. We verify our method on four architectures (CNNs, Transformers, MLPs, and Mambas) and two datasets (CIFAR-100 and Tiny-ImageNet). Compared with the best available method, our MLDR-KD improves student model performance with gains of up to 4.86% on CIFAR-100 and 2.78% on Tiny-ImageNet, respectively, showing robustness and generality in heterogeneous distillation. Code will be released soon.
   Submitted 10 February, 2025; originally announced February 2025.

9. arXiv:2502.05423 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: LRA-GNN: Latent Relation-Aware Graph Neural Network with Initial and Dynamic Residual for Facial Age Estimation
   Authors: Yiping Zhang, Yuntao Shou, Wei Ai, Tao Meng, Keqin Li
   Abstract: Face information is mainly concentrated among facial key points, and frontier research has begun to use graph neural networks to segment faces into patches as nodes to model complex face representations. However, these methods construct node-to-node relations based on similarity thresholds, so some latent relations are missing. These latent relations are crucial for deep semantic representation of face aging. In this paper, we propose a new Latent Relation-Aware Graph Neural Network with Initial and Dynamic Residual (LRA-GNN) to achieve robust and comprehensive facial representation. Specifically, we first construct an initial graph utilizing facial key points as prior knowledge, and then a random walk strategy is applied to the initial graph to obtain the global structure, both of which together guide the subsequent effective exploration and comprehensive representation. Then LRA-GNN leverages the multi-attention mechanism to capture the latent relations and generates a set of fully connected graphs containing rich facial information and complete structure based on the aforementioned guidance. To avoid over-smoothing issues in deep feature extraction on the fully connected graphs, deep residual graph convolutional networks are carefully designed, which fuse adaptive initial residuals and dynamic developmental residuals to ensure the consistency and diversity of information. Finally, to improve the estimation accuracy and generalization ability, progressive reinforcement learning is proposed to optimize the ensemble classification regressor. Our proposed framework surpasses the state-of-the-art baselines on several age estimation benchmarks, demonstrating its strength and effectiveness.
   Submitted 7 February, 2025; originally announced February 2025.

10. arXiv:2502.05177 [pdf, ps, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: Long-VITA: Scaling Large Multi-modal Models to 1 Million Tokens with Leading Short-Context Accuracy
   Authors: Yunhang Shen, Chaoyou Fu, Shaoqi Dong, Xiong Wang, Peixian Chen, Mengdan Zhang, Haoyu Cao, Ke Li, Xiawu Zheng, Yan Zhang, Yiyi Zhou, Rongrong Ji, Xing Sun
   Abstract: Establishing the long-context capability of large vision-language models is crucial for video understanding, high-resolution image understanding, multi-modal agents and reasoning. We introduce Long-VITA, a simple yet effective large multi-modal model for long-context visual-language understanding tasks. It is adept at concurrently processing and analyzing modalities of image, video, and text over 4K frames or 1M tokens while delivering advanced performance on short-context multi-modal tasks. We propose an effective multi-modal training schema that starts with large language models and proceeds through vision-language alignment, general knowledge learning, and two sequential stages of long-sequence fine-tuning. We further implement context-parallelism distributed inference and a logits-masked language modeling head to scale Long-VITA to infinitely long inputs of images and texts during model inference. Regarding training data, Long-VITA is built on a mix of 17M samples from public datasets only and demonstrates state-of-the-art performance on various multi-modal benchmarks, compared against recent cutting-edge models with internal data. Long-VITA is fully reproducible and supports both NPU and GPU platforms for training and testing. We hope Long-VITA can serve as a competitive baseline and offer valuable insights for the open-source community in advancing long-context multi-modal understanding.
   Submitted 7 February, 2025; originally announced February 2025.
   Comments: https://github.com/VITA-MLLM/Long-VITA
To address these issues, we propose Swap Forward (SaFa), a frame-level latent swap framework that synchronizes multiple diffusions to produce a globally coherent long audio with more spectrum details in a forward-only manner. At its core, the bidirectional Self-Loop Latent Swap is applied between adjacent views, leveraging stepwise diffusion trajectory to adaptively enhance high-frequency components without disrupting low-frequency components. Furthermore, to ensure cross-view consistency, the unidirectional Reference-Guided Latent Swap is applied between the reference and the non-overlap regions of each subview during the early stages, providing centralized trajectory guidance. Quantitative and qualitative experiments demonstrate that SaFa significantly outperforms existing joint diffusion methods and even training-based long audio generation models. Moreover, we find that it also adapts well to panoramic generation, achieving comparable state-of-the-art performance with greater efficiency and model generalizability. Project page is available at https://swapforward.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05130v1-abstract-full').style.display = 'none'; document.getElementById('2502.05130v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04394">arXiv:2502.04394</a> <span> [<a href="https://arxiv.org/pdf/2502.04394">pdf</a>, <a href="https://arxiv.org/format/2502.04394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DECT: Harnessing LLM-assisted Fine-Grained Linguistic Knowledge and Label-Switched and Label-Preserved Data Generation for Diagnosis of Alzheimer's Disease </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tingyu Mo</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+J+C+K">Jacqueline C. K. Lam</a>, <a href="/search/cs?searchtype=author&query=Li%2C+V+O+K">Victor O. K. Li</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+L+Y+L">Lawrence Y. L. Cheung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04394v1-abstract-short" style="display: inline;"> Alzheimer's Disease (AD) is an irreversible neurodegenerative disease affecting 50 million people worldwide. Low-cost, accurate identification of key markers of AD is crucial for timely diagnosis and intervention. Language impairment is one of the earliest signs of cognitive decline, which can be used to discriminate AD patients from normal control individuals. 
Patient-interviewer dialogues may be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04394v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04394v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04394v1-abstract-full" style="display: none;"> Alzheimer's Disease (AD) is an irreversible neurodegenerative disease affecting 50 million people worldwide. Low-cost, accurate identification of key markers of AD is crucial for timely diagnosis and intervention. Language impairment is one of the earliest signs of cognitive decline, which can be used to discriminate AD patients from normal control individuals. Patient-interviewer dialogues may be used to detect such impairments, but they are often mixed with ambiguous, noisy, and irrelevant information, making the AD detection task difficult. Moreover, the limited availability of AD speech samples and variability in their speech styles pose significant challenges in developing robust speech-based AD detection models. To address these challenges, we propose DECT, a novel speech-based domain-specific approach leveraging large language models (LLMs) for fine-grained linguistic analysis and label-switched label-preserved data generation. Our study presents four novelties: We harness the summarizing capabilities of LLMs to identify and distill key Cognitive-Linguistic information from noisy speech transcripts, effectively filtering irrelevant information. We leverage the inherent linguistic knowledge of LLMs to extract linguistic markers from unstructured and heterogeneous audio transcripts. We exploit the compositional ability of LLMs to generate AD speech transcripts consisting of diverse linguistic patterns to overcome the speech data scarcity challenge and enhance the robustness of AD detection models. We use the augmented AD textual speech transcript dataset and a more fine-grained representation of AD textual speech transcript data to fine-tune the AD detection model. The results have shown that DECT demonstrates superior model performance with an 11% improvement in AD detection accuracy on the datasets from DementiaBank compared to the baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04394v1-abstract-full').style.display = 'none'; document.getElementById('2502.04394v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
arXiv:2502.03569 (https://arxiv.org/abs/2502.03569) [pdf, other]
Subjects: Machine Learning (cs.LG); Genomics (q-bio.GN); Populations and Evolution (q-bio.PE)
Title: Controllable Sequence Editing for Counterfactual Generation
Authors: Michelle M. Li, Kevin Li, Yasha Ektefaie, Shvat Messica, Marinka Zitnik
Abstract: Sequence models generate counterfactuals by modifying parts of a sequence based on a given condition, enabling reasoning about "what if" scenarios. While these models excel at conditional generation, they lack fine-grained control over when and where edits occur. Existing approaches either focus on univariate sequences or assume that interventions affect the entire sequence globally. However, many applications require precise, localized modifications, where interventions take effect only after a specified time and impact only a subset of co-occurring variables. We introduce CLEF, a controllable sequence editing model for counterfactual reasoning about both immediate and delayed effects. CLEF learns temporal concepts that encode how and when interventions should influence a sequence. With these concepts, CLEF selectively edits relevant time steps while preserving unaffected portions of the sequence. We evaluate CLEF on cellular and patient trajectory datasets, where gene regulation affects only certain genes at specific time steps, or medical interventions alter only a subset of lab measurements. CLEF improves immediate sequence editing by up to 36.01% in MAE compared to baselines. Unlike prior methods, CLEF enables one-step generation of counterfactual sequences at any future time step, outperforming baselines by up to 65.71% in MAE. A case study on patients with type 1 diabetes mellitus shows that CLEF identifies clinical interventions that shift patient trajectories toward healthier outcomes.
Submitted 5 February, 2025; originally announced February 2025.

arXiv:2502.03297 (https://arxiv.org/abs/2502.03297) [pdf, other]
Subjects: Robotics (cs.RO); Machine Learning (cs.LG)
Title: IRIS: An Immersive Robot Interaction System
Authors: Xinkai Jiang, Qihao Yuan, Enes Ulas Dincer, Hongyi Zhou, Ge Li, Xueyin Li, Julius Haag, Nicolas Schreiber, Kailai Li, Gerhard Neumann, Rudolf Lioutikov
Abstract: This paper introduces IRIS, an Immersive Robot Interaction System leveraging Extended Reality (XR), designed for robot data collection and interaction across multiple simulators, benchmarks, and real-world scenarios. While existing XR-based data collection systems provide efficient and intuitive solutions for large-scale data collection, they are often challenging to reproduce and reuse. This limitation arises because current systems are highly tailored to simulator-specific use cases and environments. IRIS is a novel, easily extendable framework that already supports multiple simulators, benchmarks, and even headsets. Furthermore, IRIS is able to include additional information from real-world sensors, such as point clouds captured through depth cameras. A unified scene specification is generated directly from simulators or real-world sensors and transmitted to XR headsets, creating identical scenes in XR. This specification allows IRIS to support any of the objects, assets, and robots provided by the simulators. In addition, IRIS introduces shared spatial anchors and a robust communication protocol that links simulations between multiple XR headsets. This feature enables multiple XR headsets to share a synchronized scene, facilitating collaborative and multi-user data collection. IRIS can be deployed on any device that supports the Unity Framework, encompassing the vast majority of commercially available headsets. In this work, IRIS was deployed and tested on the Meta Quest 3 and the HoloLens 2. IRIS showcased its versatility across a wide range of real-world and simulated scenarios, using current popular robot simulators such as MuJoCo, IsaacSim, CoppeliaSim, and Genesis. In addition, a user study evaluates IRIS on a data collection task for the LIBERO benchmark. The study shows that IRIS significantly outperforms the baseline in both objective and subjective metrics.
Submitted 5 February, 2025; originally announced February 2025.
arXiv:2502.02196 (https://arxiv.org/abs/2502.02196) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)
Title: Exploiting Ensemble Learning for Cross-View Isolated Sign Language Recognition
Authors: Fei Wang, Kun Li, Yiqi Nie, Zhangling Duan, Peng Zou, Zhiliang Wu, Yuwei Wang, Yanyan Wei
Abstract: In this paper, we present our solution to the Cross-View Isolated Sign Language Recognition (CV-ISLR) challenge held at WWW 2025. CV-ISLR addresses a critical issue in traditional Isolated Sign Language Recognition (ISLR), where existing datasets predominantly capture sign language videos from a frontal perspective, while real-world camera angles often vary. To accurately recognize sign language from different viewpoints, models must be capable of understanding gestures from multiple angles, making cross-view recognition challenging. To address this, we explore the advantages of ensemble learning, which enhances model robustness and generalization across diverse views. Our approach, built on a multi-dimensional Video Swin Transformer model, leverages this ensemble strategy to achieve competitive performance. Finally, our solution ranked 3rd in both the RGB-based ISLR and RGB-D-based ISLR tracks, demonstrating the effectiveness in handling the challenges of cross-view recognition. The code is available at: https://github.com/Jiafei127/CV_ISLR_WWW2025.
Submitted 4 February, 2025; originally announced February 2025.
Comments: 3rd Place in Cross-View Isolated Sign Language Recognition Challenge at WWW 2025

arXiv:2502.01683 (https://arxiv.org/abs/2502.01683) [pdf, other]
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI)
Title: LLM-Powered Benchmark Factory: Reliable, Generic, and Efficient
Authors: Peiwen Yuan, Shaoxiong Feng, Yiwei Li, Xinglin Wang, Yueqi Zhang, Jiayi Shi, Chuyi Tan, Boyuan Pan, Yao Hu, Kan Li
Abstract: The rapid advancement of large language models (LLMs) has led to a surge in both model supply and application demands. To facilitate effective matching between them, reliable, generic and efficient benchmark generators are widely needed. However, human annotators are constrained by inefficiency, and current LLM benchmark generators not only lack generalizability but also struggle with limited reliability, as they lack a comprehensive evaluation framework for validation and optimization. To fill this gap, we first propose an automated and unbiased evaluation framework, structured around four dimensions and ten criteria. Under this framework, we carefully analyze the advantages and weaknesses of directly prompting LLMs as generic benchmark generators. To enhance the reliability, we introduce a series of methods to address the identified weaknesses and integrate them as BenchMaker. Experiments across multiple LLMs and tasks confirm that BenchMaker achieves superior or comparable performance to human-annotated benchmarks on all metrics, highlighting its generalizability and reliability. More importantly, it delivers highly consistent evaluation results across 12 LLMs (0.967 Pearson correlation against MMLU-Pro), while taking only $0.005 and 0.38 minutes per sample.
Submitted 2 February, 2025; originally announced February 2025.

arXiv:2502.01201 (https://arxiv.org/abs/2502.01201) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Title: One-to-Normal: Anomaly Personalization for Few-shot Anomaly Detection
Authors: Yiyue Li, Shaoting Zhang, Kang Li, Qicheng Lao
Abstract: Traditional Anomaly Detection (AD) methods have predominantly relied on unsupervised learning from extensive normal data. Recent AD methods have evolved with the advent of large pre-trained vision-language models, enhancing few-shot anomaly detection capabilities. However, these latest AD methods still exhibit limitations in accuracy improvement. One contributing factor is their direct comparison of a query image's features with those of few-shot normal images. This direct comparison often leads to a loss of precision and complicates the extension of these techniques to more complex domains, an area that remains underexplored in a more refined and comprehensive manner. To address these limitations, we introduce the anomaly personalization method, which performs a personalized one-to-normal transformation of query images using an anomaly-free customized generation model, ensuring close alignment with the normal manifold. Moreover, to further enhance the stability and robustness of prediction results, we propose a triplet contrastive anomaly inference strategy, which incorporates a comprehensive comparison between the query and generated anomaly-free data pool and prompt information. Extensive evaluations across eleven datasets in three domains demonstrate our model's effectiveness compared to the latest AD methods. Additionally, our method has been proven to transfer flexibly to other AD methods, with the generated image data effectively improving the performance of other AD methods.
Submitted 3 February, 2025; originally announced February 2025.
Comments: In The Thirty-eighth Annual Conference on Neural Information Processing Systems (NeurIPS 2024)
MSC Class: 68T45; ACM Class: I.2.10
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In The Thirty-eighth Annual Conference on Neural Information Processing Systems (NeurIPS2024)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01181">arXiv:2502.01181</a> <span> [<a href="https://arxiv.org/pdf/2502.01181">pdf</a>, <a href="https://arxiv.org/format/2502.01181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BVINet: Unlocking Blind Video Inpainting with Zero Annotations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhiliang Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kerui Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Hehe Fan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01181v1-abstract-short" style="display: inline;"> Video inpainting aims to fill in corrupted regions of the video with plausible contents. Existing methods generally assume that the locations of corrupted regions are known, focusing primarily on the "how to inpaint". This reliance necessitates manual annotation of the corrupted regions using binary masks to indicate "whereto inpaint". However, the annotation of these masks is labor-intensive and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01181v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01181v1-abstract-full" style="display: none;"> Video inpainting aims to fill in corrupted regions of the video with plausible contents. Existing methods generally assume that the locations of corrupted regions are known, focusing primarily on the "how to inpaint". This reliance necessitates manual annotation of the corrupted regions using binary masks to indicate "whereto inpaint". However, the annotation of these masks is labor-intensive and expensive, limiting the practicality of current methods. In this paper, we expect to relax this assumption by defining a new blind video inpainting setting, enabling the networks to learn the mapping from corrupted video to inpainted result directly, eliminating the need of corrupted region annotations. Specifically, we propose an end-to-end blind video inpainting network (BVINet) to address both "where to inpaint" and "how to inpaint" simultaneously. On the one hand, BVINet can predict the masks of corrupted regions by detecting semantic-discontinuous regions of the frame and utilizing temporal consistency prior of the video. 
On the other hand, the predicted masks are incorporated into the BVINet, allowing it to capture valid context information from uncorrupted regions to fill in corrupted ones. Besides, we introduce a consistency loss to regularize the training parameters of BVINet. In this way, mask prediction and video completion mutually constrain each other, thereby maximizing the overall performance of the trained model. Furthermore, we customize a dataset consisting of synthetic corrupted videos, real-world corrupted videos, and their corresponding completed videos. This dataset serves as a valuable resource for advancing blind video inpainting research. Extensive experimental results demonstrate the effectiveness and superiority of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01181v1-abstract-full').style.display = 'none'; document.getElementById('2502.01181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00709">arXiv:2502.00709</a> <span> [<a href="https://arxiv.org/pdf/2502.00709">pdf</a>, <a href="https://arxiv.org/format/2502.00709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RankFlow: A Multi-Role Collaborative Reranking Workflow Utilizing Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Can Jin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hongwu Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Anxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+N">Nuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jiahui Zhao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xi Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kuangzheng Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shuya Feng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+K">Kai Zhong</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Caiwen Ding</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D+N">Dimitris N. Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00709v2-abstract-short" style="display: inline;"> In an Information Retrieval (IR) system, reranking plays a critical role by sorting candidate passages according to their relevance to a specific query. This process demands a nuanced understanding of the variations among passages linked to the query. 
In this work, we introduce RankFlow, a multi-role reranking workflow that leverages the capabilities of Large Language Models (LLMs) and role specia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00709v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00709v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00709v2-abstract-full" style="display: none;"> In an Information Retrieval (IR) system, reranking plays a critical role by sorting candidate passages according to their relevance to a specific query. This process demands a nuanced understanding of the variations among passages linked to the query. In this work, we introduce RankFlow, a multi-role reranking workflow that leverages the capabilities of Large Language Models (LLMs) and role specializations to improve reranking performance. RankFlow enlists LLMs to fulfill four distinct roles: the query Rewriter, the pseudo Answerer, the passage Summarizer, and the Reranker. This orchestrated approach enables RankFlow to: (1) accurately interpret queries, (2) draw upon LLMs' extensive pre-existing knowledge, (3) distill passages into concise versions, and (4) assess passages in a comprehensive manner, resulting in notably better reranking results. Our experimental results reveal that RankFlow outperforms existing leading approaches on widely recognized IR benchmarks, such as TREC-DL, BEIR, and NovelEval. Additionally, we investigate the individual contributions of each role in RankFlow. Code is available at https://github.com/jincan333/RankFlow. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00709v2-abstract-full').style.display = 'none'; document.getElementById('2502.00709v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
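The four roles are easiest to picture as a chain of prompts over a single text-in/text-out LLM client. The sketch below is an illustrative approximation of such a multi-role workflow; the prompts, the 0-10 scoring format, and the `llm` callable are placeholders rather than RankFlow's actual prompts (those are in the linked repository).

```python
from typing import Callable, List

def multi_role_rerank(llm: Callable[[str], str], query: str, passages: List[str]) -> List[str]:
    """Chain Rewriter -> pseudo Answerer -> Summarizer -> Reranker prompts."""
    rewritten = llm(f"Rewrite this search query so it is precise and unambiguous:\n{query}")
    pseudo_answer = llm(f"Write a short hypothetical answer to the query:\n{rewritten}")
    scored = []
    for passage in passages:
        summary = llm(f"Summarize the following passage in two sentences:\n{passage}")
        verdict = llm(
            "On a scale of 0-10, how well does the passage answer the query?\n"
            f"Query: {rewritten}\nReference answer: {pseudo_answer}\n"
            f"Passage summary: {summary}\nReply with a single number."
        )
        try:
            score = float(verdict.strip().split()[0])
        except (ValueError, IndexError):
            score = 0.0
        scored.append((score, passage))
    # Highest-scoring passages first.
    return [p for _, p in sorted(scored, key=lambda item: item[0], reverse=True)]
```

Given any callable that maps a prompt string to a completion string, `multi_role_rerank(client, query, passages)` returns the passages sorted by the final role's scores.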
arXiv:2501.19135 (https://arxiv.org/abs/2501.19135) [pdf, other]
Subjects: Hardware Architecture (cs.AR)
Title: A Tensor-Train Decomposition based Compression of LLMs on Group Vector Systolic Accelerator
Authors: Sixiao Huang, Tintin Wang, Ang Li, Ao Shen, Kai Li, Keyao Jiang, Mingqiang Huang, Hao Yu
Abstract: Large language models (LLMs) are both storage-intensive and computation-intensive, posing significant challenges when deployed on resource-constrained hardware. As the linear layers in LLMs are the main resource-consuming parts, this paper develops a tensor-train decomposition (TTD) for LLMs, with a further hardware implementation on FPGA. TTD compression is applied to the linear layers in the ChatGLM3-6B and LLaMA2-7B models, with whole-network compression ratios (CRs) of 1.94× and 1.60×, respectively. The compressed LLMs are further implemented on FPGA hardware within a highly efficient group vector systolic array (GVSA) architecture, which has DSP-shared parallel vector PEs for TTD inference, as well as optimized data communication in the accelerator. Experimental results show that the corresponding TTD-based LLM accelerator implemented on FPGA achieves 1.45× and 1.57× reductions in first-token delay for the ChatGLM3-6B and LLaMA2-7B models, respectively.
Submitted 31 January, 2025; originally announced January 2025.
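Tensor-train compression of a linear layer reshapes its weight matrix into a higher-order tensor and factorizes it into small three-way cores; the classical TT-SVD procedure does this with a chain of truncated SVDs. The NumPy sketch below illustrates the decomposition itself, with arbitrary example shapes and ranks; the paper's contribution is the FPGA accelerator that runs inference over such cores, which is not shown here.

```python
import numpy as np

def tt_svd(tensor: np.ndarray, max_rank: int) -> list:
    """Decompose `tensor` into TT cores of shape (r_prev, n_k, r_k) via truncated SVDs."""
    shape = tensor.shape
    cores, rank = [], 1
    mat = tensor.reshape(rank * shape[0], -1)
    for k in range(len(shape) - 1):
        u, s, vt = np.linalg.svd(mat, full_matrices=False)
        r_new = min(max_rank, s.size)
        cores.append(u[:, :r_new].reshape(rank, shape[k], r_new))
        mat = (s[:r_new, None] * vt[:r_new]).reshape(r_new * shape[k + 1], -1)
        rank = r_new
    cores.append(mat.reshape(rank, shape[-1], 1))
    return cores

# Compress a 512x256 linear-layer weight by viewing it as a 6-way tensor.
W = np.random.default_rng(0).normal(size=(512, 256))
cores = tt_svd(W.reshape(8, 8, 8, 8, 8, 4), max_rank=16)

# Rebuild the tensor to check the approximation and count parameters.
full = cores[0]
for core in cores[1:]:
    full = np.tensordot(full, core, axes=1)   # contract adjacent TT ranks
full = full.squeeze().reshape(512, 256)
print(sum(c.size for c in cores), W.size)                  # TT parameters vs dense parameters
print(np.linalg.norm(full - W) / np.linalg.norm(W))        # relative reconstruction error
```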
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19135v1-abstract-full').style.display = 'none'; document.getElementById('2501.19135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18783">arXiv:2501.18783</a> <span> [<a href="https://arxiv.org/pdf/2501.18783">pdf</a>, <a href="https://arxiv.org/format/2501.18783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RUN: Reversible Unfolding Network for Concealed Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+C">Chunming He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+F">Fengyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+C">Chenyu Fang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Longxiang Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Linghe Kong</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+D">Deng-Ping Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kai Li</a>, <a href="/search/cs?searchtype=author&query=Farsiu%2C+S">Sina Farsiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18783v1-abstract-short" style="display: inline;"> Existing concealed object segmentation (COS) methods frequently utilize reversible strategies to address uncertain regions. However, these approaches are typically restricted to the mask domain, leaving the potential of the RGB domain underexplored. To address this, we propose the Reversible Unfolding Network (RUN), which applies reversible strategies across both mask and RGB domains through a the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18783v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18783v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18783v1-abstract-full" style="display: none;"> Existing concealed object segmentation (COS) methods frequently utilize reversible strategies to address uncertain regions. However, these approaches are typically restricted to the mask domain, leaving the potential of the RGB domain underexplored. To address this, we propose the Reversible Unfolding Network (RUN), which applies reversible strategies across both mask and RGB domains through a theoretically grounded framework, enabling accurate segmentation. RUN first formulates a novel COS model by incorporating an extra residual sparsity constraint to minimize segmentation uncertainties. The iterative optimization steps of the proposed model are then unfolded into a multistage network, with each step corresponding to a stage. 
Each stage of RUN consists of two reversible modules: the Segmentation-Oriented Foreground Separation (SOFS) module and the Reconstruction-Oriented Background Extraction (ROBE) module. SOFS applies the reversible strategy at the mask level and introduces Reversible State Space to capture non-local information. ROBE extends this to the RGB domain, employing a reconstruction network to address conflicting foreground and background regions identified as distortion-prone areas, which arise from their separate estimation by independent modules. As the stages progress, RUN gradually facilitates reversible modeling of foreground and background in both the mask and RGB domains, directing the network's attention to uncertain regions and mitigating false-positive and false-negative results. Extensive experiments demonstrate the superior performance of RUN and highlight the potential of unfolding-based frameworks for COS and other high-level vision tasks. We will release the code and models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18783v1-abstract-full').style.display = 'none'; document.getElementById('2501.18783v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 tables, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18351">arXiv:2501.18351</a> <span> [<a href="https://arxiv.org/pdf/2501.18351">pdf</a>, <a href="https://arxiv.org/format/2501.18351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Dual-BEV Nav: Dual-layer BEV-based Heuristic Path Planning for Robotic Navigation in Unstructured Outdoor Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Hanlin Dong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiahui Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shibo Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xuan Tang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xian Wei</a>, <a href="/search/cs?searchtype=author&query=You%2C+X">Xiong You</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18351v1-abstract-short" style="display: inline;"> Path planning with strong environmental adaptability plays a crucial role in robotic navigation in unstructured outdoor environments, especially in the case of low-quality location and map information. The path planning ability of a robot depends on the identification of the traversability of global and local ground areas. 
In real-world scenarios, the complexity of outdoor open environments makes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18351v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18351v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18351v1-abstract-full" style="display: none;"> Path planning with strong environmental adaptability plays a crucial role in robotic navigation in unstructured outdoor environments, especially in the case of low-quality location and map information. The path planning ability of a robot depends on the identification of the traversability of global and local ground areas. In real-world scenarios, the complexity of outdoor open environments makes it difficult for robots to identify the traversability of ground areas that lack a clearly defined structure. Moreover, most existing methods have rarely analyzed the integration of local and global traversability identifications in unstructured outdoor scenarios. To address this problem, we propose a novel method, Dual-BEV Nav, first introducing Bird's Eye View (BEV) representations into local planning to generate high-quality traversable paths. Then, these paths are projected onto the global traversability map generated by the global BEV planning model to obtain the optimal waypoints. By integrating the traversability from both local and global BEV, we establish a dual-layer BEV heuristic planning paradigm, enabling long-distance navigation in unstructured outdoor environments. We test our approach through both public dataset evaluations and real-world robot deployments, yielding promising results. Compared to baselines, the Dual-BEV Nav improved temporal distance prediction accuracy by up to $18.7\%$. In the real-world deployment, under conditions significantly different from the training set and with notable occlusions in the global BEV, the Dual-BEV Nav successfully achieved a 65-meter-long outdoor navigation. Further analysis demonstrates that the local BEV representation significantly enhances the rationality of the planning, while the global BEV probability map ensures the robustness of the overall planning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18351v1-abstract-full').style.display = 'none'; document.getElementById('2501.18351v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17598">arXiv:2501.17598</a> <span> [<a href="https://arxiv.org/pdf/2501.17598">pdf</a>, <a href="https://arxiv.org/format/2501.17598">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Semantic Consistency Regularization with Large Language Models for Semi-supervised Sentiment Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Kunrong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhen Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17598v1-abstract-short" style="display: inline;"> Accurate sentiment analysis of texts is crucial for a variety of applications, such as understanding customer feedback, monitoring market trends, and detecting public sentiment. However, manually annotating large sentiment corpora for supervised learning is labor-intensive and time-consuming. Therefore, it is essential and effective to develop a semi-supervised method for the sentiment analysis ta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17598v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17598v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17598v1-abstract-full" style="display: none;"> Accurate sentiment analysis of texts is crucial for a variety of applications, such as understanding customer feedback, monitoring market trends, and detecting public sentiment. However, manually annotating large sentiment corpora for supervised learning is labor-intensive and time-consuming. Therefore, it is essential and effective to develop a semi-supervised method for the sentiment analysis task. Although some methods have been proposed for semi-supervised text classification, they rely on the intrinsic information within the unlabeled data and the learning capability of the NLP model, which lack generalization ability to the sentiment analysis scenario and may prone to overfit. Inspired by the ability of pretrained Large Language Models (LLMs) in following instructions and generating coherent text, we propose a Semantic Consistency Regularization with Large Language Models (SCR) framework for semi-supervised sentiment analysis. We introduce two prompting strategies to semantically enhance unlabeled text using LLMs. The first is Entity-based Enhancement (SCR-EE), which involves extracting entities and numerical information, and querying the LLM to reconstruct the textual information. The second is Concept-based Enhancement (SCR-CE), which directly queries the LLM with the original sentence for semantic reconstruction. Subsequently, the LLM-augmented data is utilized for a consistency loss with confidence thresholding, which preserves high-quality agreement samples to provide additional supervision signals during training. 
Furthermore, to fully utilize the uncertain unlabeled data samples, we propose a class re-assembling strategy inspired by the class space shrinking theorem. Experiments show our method achieves remarkable performance over prior semi-supervised methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17598v1-abstract-full').style.display = 'none'; document.getElementById('2501.17598v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICONIP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16327">arXiv:2501.16327</a> <span> [<a href="https://arxiv.org/pdf/2501.16327">pdf</a>, <a href="https://arxiv.org/format/2501.16327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LUCY: Linguistic Understanding and Control Yielding Early Stage of Her </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+H">Heting Gao</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+H">Hang Shao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiong Wang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+C">Chaofan Qiu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yunhang Shen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+S">Siqi Cai</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuchen Shi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zihan Xu</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Z">Zuwei Long</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yike Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+S">Shaoqi Dong</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+C">Chaoyou Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Long Ma</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xing Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16327v1-abstract-short" style="display: inline;"> The film Her features Samantha, a sophisticated AI audio agent who is capable of understanding both linguistic and paralinguistic information in human speech and delivering real-time responses that are natural, informative and sensitive to emotional subtleties. 
Moving one step toward more sophisticated audio agent from recent advancement in end-to-end (E2E) speech systems, we propose LUCY, a E2E s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16327v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16327v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16327v1-abstract-full" style="display: none;"> The film Her features Samantha, a sophisticated AI audio agent who is capable of understanding both linguistic and paralinguistic information in human speech and delivering real-time responses that are natural, informative and sensitive to emotional subtleties. Moving one step toward more sophisticated audio agent from recent advancement in end-to-end (E2E) speech systems, we propose LUCY, a E2E speech model that (1) senses and responds to user's emotion, (2) deliver responses in a succinct and natural style, and (3) use external tool to answer real-time inquiries. Experiment results show that LUCY is better at emotion control than peer models, generating emotional responses based on linguistic emotional instructions and responding to paralinguistic emotional cues. Lucy is also able to generate responses in a more natural style, as judged by external language models, without sacrificing much performance on general question answering. Finally, LUCY can leverage function calls to answer questions that are out of its knowledge scope. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16327v1-abstract-full').style.display = 'none'; document.getElementById('2501.16327v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Demo Link: https://github.com/VITA-MLLM/LUCY</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16222">arXiv:2501.16222</a> <span> [<a href="https://arxiv.org/pdf/2501.16222">pdf</a>, <a href="https://arxiv.org/format/2501.16222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SPECIAL: Zero-shot Hyperspectral Image Classification With CLIP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pang%2C+L">Li Pang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jing Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaiyu Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangyong Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16222v2-abstract-short" style="display: inline;"> Hyperspectral image (HSI) classification aims at categorizing each pixel in an HSI into a specific land cover class, which is crucial for applications like remote sensing, environmental monitoring, and agriculture. 
Abstract: Hyperspectral image (HSI) classification aims at categorizing each pixel in an HSI into a specific land cover class, which is crucial for applications like remote sensing, environmental monitoring, and agriculture. Although deep learning-based HSI classification methods have achieved significant advancements, existing methods still rely on manually labeled data for training, which is both time-consuming and labor-intensive. To address this limitation, we introduce a novel zero-shot hyperspectral image classification framework based on CLIP (SPECIAL), aiming to eliminate the need for manual annotations. The SPECIAL framework consists of two main stages: (1) CLIP-based pseudo-label generation, and (2) noisy label learning. In the first stage, HSI is spectrally interpolated to produce RGB bands. These bands are subsequently classified using CLIP, resulting in noisy pseudo-labels that are accompanied by confidence scores. To improve the quality of these labels, we propose a scaling strategy that fuses predictions from multiple spatial scales. In the second stage, spectral information and a label refinement technique are incorporated to mitigate label noise and further enhance classification accuracy. Experimental results on three benchmark datasets demonstrate that our SPECIAL outperforms existing methods in zero-shot HSI classification, showing its potential for more practical applications. The code is available at https://github.com/LiPang/SPECIAL.
Submitted 27 January, 2025; v1 submitted 27 January, 2025; originally announced January 2025.
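The "scaling strategy that fuses predictions from multiple spatial scales" in the entry above can be illustrated with a short sketch. The simple averaging rule and the upstream step that produces per-scale CLIP probability maps are assumptions made here for illustration, not the paper's exact formulation:

```python
import numpy as np

def fuse_multiscale_pseudo_labels(prob_maps):
    """Fuse per-scale class-probability maps into pseudo-labels.

    prob_maps: list of (H, W, C) arrays of softmax probabilities, each assumed
               to come from CLIP zero-shot classification of the RGB rendering
               of the HSI at one spatial scale (hypothetical upstream step,
               resized back to a common H x W grid).
    Returns (labels, confidence): (H, W) integer labels and (H, W) max-prob scores.
    """
    fused = np.mean(np.stack(prob_maps, axis=0), axis=0)  # average across scales
    labels = fused.argmax(axis=-1)
    confidence = fused.max(axis=-1)
    return labels, confidence

# toy usage: two scales, a 4x4 image, 3 candidate land-cover classes
rng = np.random.default_rng(0)
maps = [rng.dirichlet(np.ones(3), size=(4, 4)) for _ in range(2)]
labels, conf = fuse_multiscale_pseudo_labels(maps)
print(labels.shape, conf.shape)  # (4, 4) (4, 4)
```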
arXiv:2501.16174 [pdf, other] stat.ML (Machine Learning), cs.AI (Artificial Intelligence), cs.DC (Distributed, Parallel, and Cluster Computing), cs.LG (Machine Learning)
Measuring Heterogeneity in Machine Learning with Distributed Energy Distance
Authors: Mengchen Fan, Baocheng Geng, Roman Shterenberg, Joseph A. Casey, Zhong Chen, Keren Li
Abstract: In distributed and federated learning, heterogeneity across data sources remains a major obstacle to effective model aggregation and convergence. We focus on feature heterogeneity and introduce energy distance as a sensitive measure for quantifying distributional discrepancies. While we show that energy distance is robust for detecting data distribution shifts, its direct use in large-scale systems can be prohibitively expensive. To address this, we develop Taylor approximations that preserve key theoretical quantitative properties while reducing computational overhead. Through simulation studies, we show how accurately capturing feature discrepancies boosts convergence in distributed learning. Finally, we propose a novel application of energy distance to assign penalty weights for aligning predictions across heterogeneous nodes, ultimately enhancing coordination in federated and distributed settings.
Submitted 27 January, 2025; originally announced January 2025.
Comments: 15 pages, 5 figures
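For reference, the (squared) energy distance used in the entry above is $D^2(X,Y) = 2\,\mathbb{E}\|X-Y\| - \mathbb{E}\|X-X'\| - \mathbb{E}\|Y-Y'\|$. A minimal NumPy estimator from two finite samples (the paper's distributed Taylor-approximation variant is not reproduced here) might look like this:

```python
import numpy as np

def energy_distance(x, y):
    """Empirical (squared) energy distance between samples x (n, d) and y (m, d).

    D^2 = 2 E||X - Y|| - E||X - X'|| - E||Y - Y'||, estimated with all pairwise
    Euclidean distances (V-statistic form, diagonal terms included).
    """
    def mean_pairwise(a, b):
        diff = a[:, None, :] - b[None, :, :]
        return np.linalg.norm(diff, axis=-1).mean()

    return 2 * mean_pairwise(x, y) - mean_pairwise(x, x) - mean_pairwise(y, y)

# toy check: a shifted feature distribution yields a clearly larger distance
rng = np.random.default_rng(1)
a = rng.normal(0.0, 1.0, size=(200, 5))
b = rng.normal(0.0, 1.0, size=(200, 5))
c = rng.normal(1.0, 1.0, size=(200, 5))
print(energy_distance(a, b) < energy_distance(a, c))  # True (with high probability)
```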
arXiv:2501.15799 [pdf, other] q-bio.BM (Biomolecules), cs.LG (Machine Learning), cs.NE (Neural and Evolutionary Computing)
Can Molecular Evolution Mechanism Enhance Molecular Representation?
Authors: Kun Li, Longtao Hu, Xiantao Cai, Jia Wu, Wenbin Hu
Abstract: Molecular evolution is the process of simulating the natural evolution of molecules in chemical space to explore potential molecular structures and properties. The relationships between similar molecules are often described through transformations such as adding, deleting, and modifying atoms and chemical bonds, reflecting specific evolutionary paths. Existing molecular representation methods mainly focus on mining data, such as atomic-level structures and chemical bonds directly from the molecules, often overlooking their evolutionary history. Consequently, we aim to explore the possibility of enhancing molecular representations by simulating the evolutionary process.
We extract and analyze the changes in the evolutionary pathway and explore combining them with existing molecular representations. Therefore, this paper proposes the molecular evolutionary network (MEvoN) for molecular representations. First, we construct the MEvoN using molecules with a small number of atoms and generate evolutionary paths utilizing similarity calculations. Then, by modeling the atomic-level changes, MEvoN reveals their impact on molecular properties. Experimental results show that the MEvoN-based molecular property prediction method significantly improves the performance of traditional end-to-end algorithms on several molecular datasets. The code is available at https://anonymous.4open.science/r/MEvoN-7416/.
Submitted 27 January, 2025; originally announced January 2025.
Comments: 9 pages, 6 figures, 5 tables
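As a rough, hypothetical illustration of "generating evolutionary paths utilizing similarity calculations", the sketch below links each molecule to its most similar molecule with fewer atoms; the cosine-on-fingerprint descriptor and the nearest-ancestor rule are assumptions for illustration, not the paper's construction:

```python
import numpy as np

def build_evolution_paths(molecules):
    """Link each molecule to its most similar 'ancestor' with fewer atoms.

    molecules: list of dicts {"id": str, "n_atoms": int, "fp": 1-D np.ndarray},
               where "fp" stands in for whatever molecular descriptor is used.
    Returns {child_id: parent_id} edges forming evolutionary paths.
    """
    def cosine(u, v):
        return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12))

    edges = {}
    for child in molecules:
        ancestors = [m for m in molecules if m["n_atoms"] < child["n_atoms"]]
        if ancestors:
            parent = max(ancestors, key=lambda m: cosine(m["fp"], child["fp"]))
            edges[child["id"]] = parent["id"]
    return edges

mols = [
    {"id": "C",   "n_atoms": 1, "fp": np.array([1.0, 0.0, 0.0])},
    {"id": "CC",  "n_atoms": 2, "fp": np.array([1.0, 1.0, 0.0])},
    {"id": "CCO", "n_atoms": 3, "fp": np.array([1.0, 1.0, 1.0])},
]
print(build_evolution_paths(mols))  # {'CC': 'C', 'CCO': 'CC'}
```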
arXiv:2501.15392 [pdf, other] cs.SE (Software Engineering), cs.AI (Artificial Intelligence)
Faster Configuration Performance Bug Testing with Neural Dual-level Prioritization
Authors: Youpeng Ma, Tao Chen, Ke Li
Abstract: As software systems become more complex and configurable, more performance problems tend to arise from the configuration designs. This has caused some configuration options to unexpectedly degrade performance, deviating from the developers' original expectations. Such discrepancies, namely configuration performance bugs (CPBugs), are devastating and can be deeply hidden in the source code. Yet, efficiently testing CPBugs is difficult, not only because the test oracle is hard to set, but also because configuration measurement is expensive and there are simply too many possible configurations to test. As such, existing testing tools suffer from lengthy runtime or have been ineffective in detecting CPBugs when the budget is limited, compounded by inaccurate test oracles. In this paper, we seek to achieve significantly faster CPBug testing by neurally prioritizing the testing at both the configuration option and value range levels, with automated oracle estimation. Our proposed tool, dubbed NDP, is a general framework that works with different heuristic generators. The idea is to leverage two neural language models: one to estimate the CPBug types that serve as the oracle while, more vitally, the other to infer the probabilities of an option being CPBug-related, based on which the options and the value ranges to be searched can be prioritized. Experiments on several widely used systems of different versions reveal that NDP can, in general, better predict CPBug types in 87% of cases and find more CPBugs with up to 88.88x testing efficiency speedup over the state-of-the-art tools.
Submitted 2 February, 2025; v1 submitted 25 January, 2025; originally announced January 2025.
Comments: accepted by ICSE 2025
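The dual-level prioritization idea in the entry above can be sketched as follows. The per-option scores are assumed to come from a neural language model, and all function, option, and range names here are hypothetical:

```python
def prioritize_tests(option_scores, value_ranges, budget):
    """Dual-level prioritization sketch: order configuration options by their
    estimated probability of being CPBug-related, then enumerate each option's
    candidate value ranges, until the measurement budget is exhausted.

    option_scores: {option_name: probability}, assumed to come from a neural
                   language model scoring option names/descriptions.
    value_ranges:  {option_name: [range, ...]}, most-suspicious range first.
    """
    plan = []
    for opt, _score in sorted(option_scores.items(), key=lambda kv: -kv[1]):
        for rng in value_ranges.get(opt, []):
            if len(plan) == budget:
                return plan
            plan.append((opt, rng))
    return plan

scores = {"cache_size": 0.91, "log_level": 0.12, "thread_pool": 0.74}
ranges = {"cache_size": [(0, 64), (64, 4096)],
          "thread_pool": [(1, 8), (8, 256)],
          "log_level": [("debug", "error")]}
print(prioritize_tests(scores, ranges, budget=3))
# [('cache_size', (0, 64)), ('cache_size', (64, 4096)), ('thread_pool', (1, 8))]
```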
arXiv:2501.15368 [pdf, other] cs.CL (Computation and Language), cs.SD (Sound), eess.AS (Audio and Speech Processing)
Baichuan-Omni-1.5 Technical Report
Authors: Yadong Li, Jun Liu, Tao Zhang, Tao Zhang, Song Chen, Tianpeng Li, Zehuan Li, Lijun Liu, Lingfeng Ming, Guosheng Dong, Da Pan, Chong Li, Yuanbo Fang, Dongdong Kuang, Mingrui Wang, Chenglin Zhu, Youwei Zhang, Hongyu Guo, Fengyu Zhang, Yuran Wang, Bowen Ding, Wei Song, Xu Li, Yuqi Huo, Zheng Liang, et al. (68 additional authors not shown)
Abstract: We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects.
First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks.
Submitted 25 January, 2025; originally announced January 2025.
arXiv:2501.15302 [pdf, ps, other] cs.SD (Sound), eess.AS (Audio and Speech Processing)
The ICME 2025 Audio Encoder Capability Challenge
Authors: Junbo Zhang, Heinrich Dinkel, Qiong Song, Helen Wang, Yadong Niu, Si Cheng, Xiaofeng Xin, Ke Li, Wenwu Wang, Yujun Wang, Jian Luan
Abstract: This challenge aims to evaluate the capabilities of audio encoders, especially in the context of multi-task learning and real-world applications. Participants are invited to submit pre-trained audio encoders that map raw waveforms to continuous embeddings. These encoders will be tested across diverse tasks including speech, environmental sounds, and music, with a focus on real-world usability. The challenge features two tracks: Track A for parameterized evaluation, and Track B for parameter-free evaluation. This challenge provides a platform for evaluating and advancing the state-of-the-art in audio encoder design.
Submitted 25 January, 2025; originally announced January 2025.

arXiv:2501.15167 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Enhancing Intent Understanding for Ambiguous Prompts through Human-Machine Co-Adaptation
Authors: Yangfan He, Jianhui Wang, Kun Li, Yijin Wang, Li Sun, Jun Yin, Miao Zhang, Xueqian Wang
Abstract: Modern image generation systems can produce high-quality visuals, yet user prompts often contain ambiguities, requiring multiple revisions. Existing methods struggle to address the nuanced needs of non-expert users. We propose Visual Co-Adaptation (VCA), a novel framework that iteratively refines prompts and aligns generated images with user preferences. VCA employs a fine-tuned language model with reinforcement learning and multi-turn dialogues for prompt disambiguation. Key components include the Incremental Context-Enhanced Dialogue Block for interactive clarification, the Semantic Exploration and Disambiguation Module (SESD) leveraging Retrieval-Augmented Generation (RAG) and CLIP scoring, and the Pixel Precision and Consistency Optimization Module (PPCO) for refining image details using Proximal Policy Optimization (PPO). A human-in-the-loop feedback mechanism further improves performance.
Experiments show that VCA surpasses models like DALL-E 3 and Stable Diffusion, reducing dialogue rounds to 4.3, achieving a CLIP score of 0.92, and enhancing user satisfaction to 4.73/5. Additionally, we introduce a novel multi-round dialogue dataset with prompt-image pairs and user intent annotations.
Submitted 25 January, 2025; originally announced January 2025.

arXiv:2501.15077 [pdf, other] cs.CR (Cryptography and Security), cs.DB (Databases)
NetChain: Authenticated Blockchain Top-k Graph Data Queries and its Application in Asset Management
Authors: Hongguang Zhao, Xu Yang, Saiyu Qi, Qiuhao Wang, Ke Li
Abstract: As a valuable digital resource, graph data is an important data asset, which has been widely utilized across various fields to optimize decision-making and enable smarter solutions. To manage data assets, blockchain is widely used to enable data sharing and trading, but it cannot supply complex analytical queries. vChain was proposed to achieve verifiable boolean queries over blockchain by designing an embedded authenticated data structure (ADS). However, for generating (non-)existence proofs, vChain suffers from expensive storage and computation costs in ADS construction, along with high communication and verification costs.
In this paper, we propose a novel NetChain framework that enables efficient top-k queries over on-chain graph data with verifiability. Specifically, we design a novel authenticated two-layer index that supports (non-)existence proof generation at the block level and built-in verifiability for matched objects. To further alleviate the computation and verification overhead, an optimized variant NetChain+ is derived. The authenticity of our frameworks is validated through security analysis. Evaluations show that NetChain and NetChain+ outperform vChain, respectively achieving up to 85X and 31X improvements on ADS construction. Moreover, compared with vChain, NetChain+ reduces the communication and verification costs by 87% and 96% respectively.
Submitted 24 January, 2025; originally announced January 2025.

arXiv:2501.15043 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Prompt-Aware Controllable Shadow Removal
Authors: Kerui Chen, Zhiliang Wu, Wenjin Hou, Kun Li, Hehe Fan, Yi Yang
Abstract: Shadow removal aims to restore the image content in shadowed regions. While deep learning-based methods have shown promising results, they still face key challenges: 1) uncontrolled removal of all shadows, or 2) controllable removal that relies heavily on precise shadow region masks.
To address these issues, we introduce a novel paradigm: prompt-aware controllable shadow removal. Unlike existing approaches, our paradigm allows for targeted shadow removal from specific subjects based on user prompts (e.g., dots, lines, or subject masks). This approach eliminates the need for shadow annotations and offers flexible, user-controlled shadow removal. Specifically, we propose an end-to-end learnable model, the Prompt-Aware Controllable Shadow Removal Network (PACSRNet). PACSRNet consists of two key modules: a prompt-aware module that generates shadow masks for the specified subject based on the user prompt, and a shadow removal module that uses the shadow prior from the first module to restore the content in the shadowed regions. Additionally, we enhance the shadow removal module by incorporating feature information from the prompt-aware module through a linear operation, providing prompt-guided support for shadow removal. Recognizing that existing shadow removal datasets lack diverse user prompts, we contribute a new dataset specifically designed for prompt-based controllable shadow removal. Extensive experimental results demonstrate the effectiveness and superiority of PACSRNet.
Submitted 2 February, 2025; v1 submitted 24 January, 2025; originally announced January 2025.

arXiv:2501.15040 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Complementary Subspace Low-Rank Adaptation of Vision-Language Models for Few-Shot Classification
Authors: Zhongqi Wang, Jia Dai, Kai Li, Xu Li, Yanmeng Guo, Maosheng Xiang
Abstract: Vision language models (VLMs) have been designed for large-scale image-text alignment as pretrained foundation models. For downstream few-shot classification tasks, parameter-efficient fine-tuning (PEFT) of VLMs has gained much popularity in the computer vision community.
PEFT methods such as prompt tuning and linear adapters have been studied for fine-tuning VLMs, while the low-rank adaptation (LoRA) algorithm has rarely been considered for few-shot fine-tuning of VLMs. The main obstacle to using LoRA for few-shot fine-tuning is catastrophic forgetting: the vision-language alignment knowledge is important for generality in few-shot learning, whereas low-rank adaptation interferes with the most informative directions of the pretrained weight matrix. We propose the complementary subspace low-rank adaptation (Comp-LoRA) method to regularize the catastrophic forgetting problem in few-shot VLM fine-tuning. In detail, we optimize the low-rank matrix in the complementary subspace, thus preserving the general vision-language alignment ability of the VLM when learning the novel few-shot information. We conduct comparison experiments of the proposed Comp-LoRA method and other PEFT methods on fine-tuning VLMs for few-shot classification, and we also show that our method suppresses catastrophic forgetting compared with directly applying LoRA to the VLM. The results show that the proposed method surpasses the baseline method by about +1.0% Top-1 accuracy and preserves the VLM zero-shot performance over the baseline method by about +1.3% Top-1 accuracy.
Submitted 24 January, 2025; originally announced January 2025.
Comments: Preprint version
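One plausible reading of "optimizing the low-rank matrix in the complementary subspace" is to keep the adaptation update orthogonal to the dominant singular directions of the pretrained weight, so the most informative directions are left untouched. The sketch below shows that generic projection; the exact Comp-LoRA construction may differ:

```python
import numpy as np

def project_update_to_complement(W, delta, k=8):
    """Project a low-rank update `delta` onto the subspace orthogonal to the
    top-k left singular directions of the pretrained weight W.

    Generic illustration of 'optimizing in the complementary subspace';
    not necessarily the formulation used in the Comp-LoRA paper.
    """
    U, _, _ = np.linalg.svd(W, full_matrices=False)
    U_k = U[:, :k]                          # most informative directions of W
    P = np.eye(W.shape[0]) - U_k @ U_k.T    # projector onto their orthogonal complement
    return P @ delta

rng = np.random.default_rng(2)
W = rng.normal(size=(64, 32))
delta = rng.normal(size=(64, 8)) @ rng.normal(size=(8, 32))   # rank-8 LoRA-style update
delta_c = project_update_to_complement(W, delta, k=4)

# the projected update has no component along W's top-4 left singular vectors
U4 = np.linalg.svd(W, full_matrices=False)[0][:, :4]
print(np.allclose(U4.T @ delta_c, 0.0, atol=1e-8))  # True
```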
arXiv:2501.14082 [pdf, other] cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.LG (Machine Learning)
Communicating Activations Between Language Model Agents
Authors: Vignav Ramesh, Kenneth Li
Abstract: Communication between multiple language model (LM) agents has been shown to scale up the reasoning ability of LMs. While natural language has been the dominant medium for inter-LM communication, it is not obvious this should be the standard: not only does natural language communication incur high inference costs that scale quickly with the number of both agents and messages, but also the decoding process abstracts away too much rich information that could be otherwise accessed from the internal activations. In this work, we propose a simple technique whereby LMs communicate via activations; concretely, we pause an LM $B$'s computation at an intermediate layer, combine its current activation with another LM $A$'s intermediate activation via some function $f$, then pass $f$'s output into the next layer of $B$ and continue the forward pass till decoding is complete. This approach scales up LMs on new tasks with zero additional parameters and data, and saves a substantial amount of compute over natural language communication. We test our method with various functional forms $f$ on two experimental setups--multi-player coordination games and reasoning benchmarks--and find that it achieves up to $27.0\%$ improvement over natural language communication across datasets with $<1/4$ the compute, illustrating the superiority and robustness of activations as an alternative "language" for communication between LMs.
Submitted 23 January, 2025; originally announced January 2025.
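The grafting mechanism described in the entry above (pause LM $B$ at an intermediate layer, combine its activation with LM $A$'s via $f$, then continue) can be sketched with a toy model. The tiny MLP stack and the simple additive $f$ are stand-ins chosen here for illustration, not the paper's models or its preferred combination function:

```python
import torch
import torch.nn as nn

class TinyLM(nn.Module):
    """Stand-in for a decoder-only LM: a stack of d-dimensional MLP blocks."""
    def __init__(self, d=16, n_layers=4):
        super().__init__()
        self.layers = nn.ModuleList(nn.Sequential(nn.Linear(d, d), nn.ReLU())
                                    for _ in range(n_layers))

    def forward(self, h, inject=None, at_layer=None, f=lambda hb, ha: hb + ha):
        """Run the forward pass; optionally combine the hidden state with an
        activation `inject` from another model just before layer `at_layer`."""
        acts = []
        for i, layer in enumerate(self.layers):
            if inject is not None and i == at_layer:
                h = f(h, inject)            # grafted communication step
            h = layer(h)
            acts.append(h)
        return h, acts

torch.manual_seed(0)
lm_a, lm_b = TinyLM(), TinyLM()
x_a, x_b = torch.randn(1, 16), torch.randn(1, 16)

_, acts_a = lm_a(x_a)                       # run A once, keep its activations
h_a_mid = acts_a[1]                         # A's hidden state after layer 1
out_b, _ = lm_b(x_b, inject=h_a_mid, at_layer=2)  # resume B with A's activation mixed in
print(out_b.shape)  # torch.Size([1, 16])
```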
arXiv:2501.13106 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding
Authors: Boqiang Zhang, Kehan Li, Zesen Cheng, Zhiqiang Hu, Yuqian Yuan, Guanzheng Chen, Sicong Leng, Yuming Jiang, Hang Zhang, Xin Li, Peng Jin, Wenqi Zhang, Fan Wang, Lidong Bing, Deli Zhao
Abstract: In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation model for image and video understanding. The core design philosophy of VideoLLaMA3 is vision-centric. The meaning of "vision-centric" is two-fold: the vision-centric training paradigm and vision-centric framework design.
The key insight of our vision-centric training paradigm is that high-quality image-text data is crucial for both image and video understanding. Instead of preparing massive video-text datasets, we focus on constructing large-scale and high-quality image-text datasets. VideoLLaMA3 has four training stages: 1) Vision Encoder Adaptation, which enables the vision encoder to accept images of variable resolutions as input; 2) Vision-Language Alignment, which jointly tunes the vision encoder, projector, and LLM with large-scale image-text data covering multiple types (including scene images, documents, and charts) as well as text-only data; 3) Multi-task Fine-tuning, which incorporates image-text SFT data for downstream tasks and video-text data to establish a foundation for video understanding; and 4) Video-centric Fine-tuning, which further improves the model's capability in video understanding. As for the framework design, to better capture fine-grained details in images, the pretrained vision encoder is adapted to encode images of varying sizes into vision tokens with corresponding numbers, rather than a fixed number of tokens. For video inputs, we reduce the number of vision tokens according to their similarity so that the representation of videos will be more precise and compact. Benefiting from these vision-centric designs, VideoLLaMA3 achieves compelling performance in both image and video understanding benchmarks.
Submitted 28 January, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
Comments: BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3
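As a hypothetical illustration of "reducing the number of vision tokens according to their similarity", the sketch below greedily folds consecutive near-duplicate tokens into one averaged token. The greedy rule and the threshold are assumptions; the paper's actual reduction scheme may differ:

```python
import numpy as np

def merge_similar_tokens(tokens, threshold=0.9):
    """Greedy similarity-based token reduction for video inputs: consecutive
    vision tokens whose cosine similarity to the running group mean exceeds
    `threshold` are averaged into a single token.

    tokens: (N, D) array of vision tokens in temporal/spatial order.
    Returns an (M, D) array with M <= N.
    """
    def cos(u, v):
        return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12))

    groups = [[tokens[0]]]
    for t in tokens[1:]:
        if cos(np.mean(groups[-1], axis=0), t) >= threshold:
            groups[-1].append(t)      # near-duplicate: fold into current group
        else:
            groups.append([t])        # sufficiently different: keep as a new token
    return np.stack([np.mean(g, axis=0) for g in groups])

rng = np.random.default_rng(3)
base = rng.normal(size=(1, 64))
frames = np.concatenate([base + 0.01 * rng.normal(size=(5, 64)),   # 5 near-identical tokens
                         rng.normal(size=(3, 64))])                # 3 distinct tokens
print(merge_similar_tokens(frames, threshold=0.9).shape)  # (4, 64)
```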
arXiv:2501.12948 [pdf, other] cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.LG (Machine Learning)
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning
Authors: DeepSeek-AI, Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, Xiaokang Zhang, Xingkai Yu, Yu Wu, Z. F. Wu, Zhibin Gou, Zhihong Shao, Zhuoshu Li, Ziyi Gao, Aixin Liu, Bing Xue, Bingxuan Wang, Bochao Wu, Bei Feng, Chengda Lu, et al. (175 additional authors not shown)
Abstract: We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors.
However, it encounters challenges such as poor readability and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.
Submitted 22 January, 2025; originally announced January 2025.

arXiv:2501.12931 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
DynamicEarth: How Far are We from Open-Vocabulary Change Detection?
Authors: Kaiyu Li, Xiangyong Cao, Yupeng Deng, Chao Pang, Zepeng Xin, Deyu Meng, Zhi Wang
Abstract: Monitoring Earth's evolving land covers requires methods capable of detecting changes across a wide range of categories and contexts. Existing change detection methods are hindered by their dependency on predefined classes, reducing their effectiveness in open-world applications.
To address this issue, we introduce open-vocabulary change detection (OVCD), a novel task that bridges vision and language to detect changes across any category. Considering the lack of high-quality data and annotation, we propose two training-free frameworks, M-C-I and I-M-C, which leverage and integrate off-the-shelf foundation models for the OVCD task. The insight behind the M-C-I framework is to discover all potential changes and then classify these changes, while the insight behind the I-M-C framework is to identify all targets of interest and then determine whether their states have changed. Based on these two frameworks, we instantiate several methods, e.g., SAM-DINOv2-SegEarth-OV, Grounding-DINO-SAM2-DINO, etc. Extensive evaluations on 5 benchmark datasets demonstrate the superior generalization and robustness of our OVCD methods over existing supervised and unsupervised methods. To support continued exploration, we release DynamicEarth, a dedicated codebase designed to advance research and application of OVCD. https://likyoo.github.io/DynamicEarth
Submitted 22 January, 2025; originally announced January 2025.
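The I-M-C insight ("identify all targets of interest, then determine whether their states have changed") can be sketched with placeholder components. In the real frameworks the two stand-in functions below would be served by off-the-shelf foundation models (SAM-, DINO-, or Grounding-DINO-style detectors and classifiers); everything here is a hypothetical toy:

```python
def identify_targets(image, class_names):
    """Stand-in for an open-vocabulary detector/segmenter.
    Returns {region_id: class_name} for targets found at time 1."""
    return {0: class_names[0]}  # toy: one region of the first queried class

def classify_region(image, region_id, class_names):
    """Stand-in for re-classifying the same region in the time-2 image."""
    return class_names[0] if image.get("changed") is not True else "background"

def imc_change_detection(img_t1, img_t2, class_names):
    """'Identify, then compare' sketch: find targets of interest at time 1,
    then flag each target whose predicted class differs at time 2."""
    targets = identify_targets(img_t1, class_names)
    return {rid: classify_region(img_t2, rid, class_names) != cls
            for rid, cls in targets.items()}

print(imc_change_detection({"changed": False}, {"changed": True}, ["building"]))
# {0: True}  -> the 'building' region is flagged as changed
```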
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12881">arXiv:2501.12881</a> <span> [<a href="https://arxiv.org/pdf/2501.12881">pdf</a>, <a href="https://arxiv.org/format/2501.12881">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement learning Based Automated Design of Differential Evolution Algorithm for Black-box Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xu Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaiwen Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Ling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12881v1-abstract-short" style="display: inline;"> Differential evolution (DE) algorithm is recognized as one of the most effective evolutionary algorithms, demonstrating remarkable efficacy in black-box optimization due to its derivative-free nature. Numerous enhancements to the fundamental DE have been proposed, incorporating innovative mutation strategies and sophisticated parameter tuning techniques to improve performance. However, no single v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12881v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12881v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12881v1-abstract-full" style="display: none;"> Differential evolution (DE) algorithm is recognized as one of the most effective evolutionary algorithms, demonstrating remarkable efficacy in black-box optimization due to its derivative-free nature. Numerous enhancements to the fundamental DE have been proposed, incorporating innovative mutation strategies and sophisticated parameter tuning techniques to improve performance. However, no single variant has proven universally superior across all problems. To address this challenge, we introduce a novel framework that employs reinforcement learning (RL) to automatically design DE for black-box optimization through meta-learning. RL acts as an advanced meta-optimizer, generating a customized DE configuration that includes an optimal initialization strategy, update rule, and hyperparameters tailored to a specific black-box optimization problem. This process is informed by a detailed analysis of the problem characteristics. In this proof-of-concept study, we utilize a double deep Q-network for implementation, considering a subset of 40 possible strategy combinations and parameter optimizations simultaneously. The framework's performance is evaluated against black-box optimization benchmarks and compared with state-of-the-art algorithms. The experimental results highlight the promising potential of our proposed framework. 
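<p class="is-size-7">The entry above describes a reinforcement-learning meta-optimizer that selects among DE mutation strategies and parameter settings. For readers unfamiliar with the building blocks being selected, here is a minimal, self-contained DE/rand/1/bin generation; the population size, F, CR, and the sphere objective are illustrative defaults rather than values from the paper, and the RL controller itself is not shown.</p> <pre><code class="language-python">
# One generation of classic DE/rand/1/bin: differential mutation, binomial
# crossover, greedy selection.  An RL meta-optimizer of the kind described above
# would choose such operators and parameters per problem.
import numpy as np

rng = np.random.default_rng(1)
pop, dim, F, CR = 20, 5, 0.8, 0.9
X = rng.uniform(-5, 5, size=(pop, dim))
fitness = (X ** 2).sum(axis=1)                 # sphere objective, lower is better

for i in range(pop):
    candidates = [j for j in range(pop) if j != i]
    r1, r2, r3 = rng.choice(candidates, size=3, replace=False)
    v = X[r1] + F * (X[r2] - X[r3])            # differential mutation
    cross = CR >= rng.random(dim)              # binomial crossover mask
    cross[rng.integers(dim)] = True            # keep at least one mutant gene
    u = np.where(cross, v, X[i])
    fu = (u ** 2).sum()
    if fitness[i] > fu:                        # greedy selection
        X[i], fitness[i] = u, fu

print("best fitness after one generation:", float(fitness.min()))
</code></pre>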
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12881v1-abstract-full').style.display = 'none'; document.getElementById('2501.12881v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12619">arXiv:2501.12619</a> <span> [<a href="https://arxiv.org/pdf/2501.12619">pdf</a>, <a href="https://arxiv.org/format/2501.12619">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Distillation Quantification for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sunbowen Lee</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junting Zhou</a>, <a href="/search/cs?searchtype=author&query=Ao%2C+C">Chang Ao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaige Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xinrun Du</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Sirui He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Z">Zhoufutu Wen</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+S">Shiwen Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12619v2-abstract-short" style="display: inline;"> Model distillation is a technique for transferring knowledge from large language models (LLMs) to smaller ones, aiming to create resource-efficient yet high-performing models. However, excessive distillation can lead to homogenization, reducing diversity among models and impairing their ability to robustly handle complex or novel tasks. These limitations underscore the need to systematically quant… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12619v2-abstract-full').style.display = 'inline'; document.getElementById('2501.12619v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12619v2-abstract-full" style="display: none;"> Model distillation is a technique for transferring knowledge from large language models (LLMs) to smaller ones, aiming to create resource-efficient yet high-performing models. However, excessive distillation can lead to homogenization, reducing diversity among models and impairing their ability to robustly handle complex or novel tasks. These limitations underscore the need to systematically quantify the distillation process and its impact. In this work, we propose a framework to evaluate and quantify model distillation. Our method addresses two key aspects: (1) Identifying identity cognition contradictions to assess discrepancies in how models perceive and represent identity-related information, and (2) Analyzing multi-granularity response similarities across models to measure the extent of homogenization. 
Experimental results demonstrate two key insights: (1) Well-known closed-source and open-source LLMs usually exhibit high distillation degrees, except for Claude, Doubao, and Gemini. (2) Base LLMs show higher distillation degrees compared to aligned LLMs. By offering a systematic approach to improve the transparency of LLM data distillation, we call for LLMs with more independent development and more transparent technical reports to improve LLMs' robustness and safety. The code and data are available under https://github.com/Aegis1863/LLMs-Distillation-Quantification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12619v2-abstract-full').style.display = 'none'; document.getElementById('2501.12619v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12501">arXiv:2501.12501</a> <span> [<a href="https://arxiv.org/pdf/2501.12501">pdf</a>, <a href="https://arxiv.org/format/2501.12501">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> A Domain Adaptation Framework for Speech Recognition Systems with Only Synthetic data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tran%2C+M">Minh Tran</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yutong Pang</a>, <a href="/search/cs?searchtype=author&query=Paul%2C+D">Debjyoti Paul</a>, <a href="/search/cs?searchtype=author&query=Pandey%2C+L">Laxmi Pandey</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kevin Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jinxi Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuedong Zhang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+X">Xin Lei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12501v1-abstract-short" style="display: inline;"> We introduce DAS (Domain Adaptation with Synthetic data), a novel domain adaptation framework for pre-trained ASR model, designed to efficiently adapt to various language-defined domains without requiring any real data. In particular, DAS first prompts large language models (LLMs) to generate domain-specific texts before converting these texts to speech via text-to-speech technology. 
The synthetic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12501v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12501v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12501v1-abstract-full" style="display: none;"> We introduce DAS (Domain Adaptation with Synthetic data), a novel domain adaptation framework for pre-trained ASR model, designed to efficiently adapt to various language-defined domains without requiring any real data. In particular, DAS first prompts large language models (LLMs) to generate domain-specific texts before converting these texts to speech via text-to-speech technology. The synthetic data is used to fine-tune Whisper with Low-Rank Adapters (LoRAs) for targeted domains such as music, weather, and sports. We introduce a novel one-pass decoding strategy that merges predictions from multiple LoRA adapters efficiently during the auto-regressive text generation process. Experimental results show significant improvements, reducing the Word Error Rate (WER) by 10% to 17% across all target domains compared to the original model, with minimal performance regression in out-of-domain settings (e.g., -1% on Librispeech test sets). We also demonstrate that DAS operates efficiently during inference, introducing an additional 9% increase in Real Time Factor (RTF) compared to the original model when inferring with three LoRA adapters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12501v1-abstract-full').style.display = 'none'; document.getElementById('2501.12501v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
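<p class="is-size-7">The DAS entry above mentions a one-pass decoding strategy that merges predictions from multiple LoRA adapters during auto-regressive generation. The abstract does not spell out the merging rule, so the sketch below simply averages the per-adapter next-token distributions at a single decoding step; the logits are random placeholders.</p> <pre><code class="language-python">
# Illustrative merge of next-token predictions from several LoRA adapters at one
# decoding step: average the per-adapter probability distributions and take the
# argmax.  The exact merging rule used by DAS is not given in the abstract.
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(2)
vocab, n_adapters = 32_000, 3
adapter_logits = rng.normal(size=(n_adapters, vocab))   # one row per LoRA adapter

probs = softmax(adapter_logits).mean(axis=0)            # uniform merge across adapters
next_token = int(probs.argmax())
print("merged next-token id:", next_token)
</code></pre>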
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12326">arXiv:2501.12326</a> <span> [<a href="https://arxiv.org/pdf/2501.12326">pdf</a>, <a href="https://arxiv.org/format/2501.12326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> UI-TARS: Pioneering Automated GUI Interaction with Native Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yujia Qin</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yining Ye</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junjie Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoming Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shihao Liang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+S">Shizuo Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junda Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunxin Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shijue Huang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+W">Wanjun Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kuanye Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiale Yang</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yu Miao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Woyu Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xu Jiang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qianli Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingyu Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+X">Xiaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+K">Kai Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuang Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yaowei Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chaolin Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a> , et al. (10 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12326v1-abstract-short" style="display: inline;"> This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). 
Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12326v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12326v1-abstract-full" style="display: none;"> This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks. Experiments demonstrate its superior performance: UI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating perception, grounding, and GUI task execution. Notably, in the OSWorld benchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15 steps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld, UI-TARS achieves 46.6, surpassing GPT-4o (34.5). UI-TARS incorporates several key innovations: (1) Enhanced Perception: leveraging a large-scale dataset of GUI screenshots for context-aware understanding of UI elements and precise captioning; (2) Unified Action Modeling, which standardizes actions into a unified space across platforms and achieves precise grounding and interaction through large-scale action traces; (3) System-2 Reasoning, which incorporates deliberate reasoning into multi-step decision making, involving multiple reasoning patterns such as task decomposition, reflection thinking, milestone recognition, etc. (4) Iterative Training with Reflective Online Traces, which addresses the data bottleneck by automatically collecting, filtering, and reflectively refining new interaction traces on hundreds of virtual machines. Through iterative training and reflection tuning, UI-TARS continuously learns from its mistakes and adapts to unforeseen situations with minimal human intervention. We also analyze the evolution path of GUI agents to guide the further development of this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12326v1-abstract-full').style.display = 'none'; document.getElementById('2501.12326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
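<p class="is-size-7">UI-TARS's "Unified Action Modeling" standardizes keyboard- and mouse-style interactions into one action space across platforms. The record type below is a hypothetical illustration of what such a normalized action could carry; the field names are invented here and are not UI-TARS's actual schema.</p> <pre><code class="language-python">
# Hypothetical platform-agnostic GUI action record, illustrating the idea of a
# unified action space; field names are invented for this example.
from dataclasses import dataclass, asdict
from typing import Optional, Tuple

@dataclass
class GuiAction:
    kind: str                                     # e.g. "click", "type", "scroll"
    target: Optional[Tuple[float, float]] = None  # normalized (x, y) in [0, 1]
    text: Optional[str] = None                    # payload for "type" actions
    delta: Optional[int] = None                   # payload for "scroll" actions

actions = [
    GuiAction(kind="click", target=(0.42, 0.17)),
    GuiAction(kind="type", text="weather tomorrow"),
    GuiAction(kind="scroll", delta=-3),
]
for a in actions:
    print(asdict(a))
</code></pre>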
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09767">arXiv:2501.09767</a> <span> [<a href="https://arxiv.org/pdf/2501.09767">pdf</a>, <a href="https://arxiv.org/format/2501.09767">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LeMo: Enabling LEss Token Involvement for MOre Context Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tuowei Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+T">Ting Cao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Ju Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yaoxue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09767v1-abstract-short" style="display: inline;"> The escalating demand for long-context applications has intensified the necessity of extending the LLM context windows. Despite recent fine-tuning approaches successfully expanding context lengths, their high memory footprints, especially for activations, present a critical practical limitation. Current parameter-efficient fine-tuning methods prioritize reducing parameter update overhead over addr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09767v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09767v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09767v1-abstract-full" style="display: none;"> The escalating demand for long-context applications has intensified the necessity of extending the LLM context windows. Despite recent fine-tuning approaches successfully expanding context lengths, their high memory footprints, especially for activations, present a critical practical limitation. Current parameter-efficient fine-tuning methods prioritize reducing parameter update overhead over addressing activation memory constraints. Similarly, existing sparsity mechanisms improve computational efficiency but overlook activation memory optimization due to the phenomenon of Shadowy Activation. In this paper, we propose LeMo, the first LLM fine-tuning system that explores and exploits a new token-level sparsity mechanism inherent in long-context scenarios, termed Contextual Token Sparsity. LeMo minimizes redundant token involvement by assessing the informativeness of token embeddings while preserving model accuracy. Specifically, LeMo introduces three key techniques: (1) Token Elimination, dynamically identifying and excluding redundant tokens across varying inputs and layers. (2) Pattern Prediction, utilizing well-trained predictors to approximate token sparsity patterns with minimal overhead. (3) Kernel Optimization, employing permutation-free and segment-based strategies to boost system performance. 
We implement LeMo as an end-to-end fine-tuning system compatible with various LLM architectures and other optimization techniques. Comprehensive evaluations demonstrate that LeMo reduces memory consumption by up to 1.93x and achieves up to 1.36x speedups, outperforming state-of-the-art fine-tuning systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09767v1-abstract-full').style.display = 'none'; document.getElementById('2501.09767v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08570">arXiv:2501.08570</a> <span> [<a href="https://arxiv.org/pdf/2501.08570">pdf</a>, <a href="https://arxiv.org/format/2501.08570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Information Entropy Invariance: Enhancing Length Extrapolation in Attention Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Kewei Li</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Yanwen Kong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yiping Xu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jianlin Su</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lan Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruochi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Fengfeng Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08570v3-abstract-short" style="display: inline;"> Since the emergence of research on improving the length extrapolation capabilities of large language models in 2021, some studies have made modifications to the scaling factor in the scaled dot-product attention mechanism as part of their proposed methods without rigorous theoretical justifications. To fill this gap, we propose two new scaled temperatures based on information entropy invariance to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08570v3-abstract-full').style.display = 'inline'; document.getElementById('2501.08570v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08570v3-abstract-full" style="display: none;"> Since the emergence of research on improving the length extrapolation capabilities of large language models in 2021, some studies have made modifications to the scaling factor in the scaled dot-product attention mechanism as part of their proposed methods without rigorous theoretical justifications. To fill this gap, we propose two new scaled temperatures based on information entropy invariance to enhance length extrapolation. First, a training-free method InfoScale is designed for dot-product attention, and preserves focus on original tokens during length extrapolation by ensuring consistent entropy. 
Second, we theoretically analyze the impact of scaling (CosScale) on cosine attention. Experimental data demonstrates that combining InfoScale and CosScale achieves state-of-the-art performance on the GAU-α model with a context window extended to 64 times the training length, and outperforms seven existing methods. Our analysis reveals that significantly increasing CosScale approximates the Windowed Attention, and highlights the significance of attention score dilution as a key challenge in long-range context handling. The code and data are available at https://github.com/HT-NEKO/Information-Entropy-Invariance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08570v3-abstract-full').style.display = 'none'; document.getElementById('2501.08570v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08518">arXiv:2501.08518</a> <span> [<a href="https://arxiv.org/pdf/2501.08518">pdf</a>, <a href="https://arxiv.org/format/2501.08518">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Easing Seasickness through Attention Redirection with a Mindfulness-Based Brain-Computer Interface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+X">Xiaoyu Bao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kailin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiawei Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haiyun Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kangning Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qiyun Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuanqing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08518v1-abstract-short" style="display: inline;"> Seasickness is a prevalent issue that adversely impacts both passenger experiences and the operational efficiency of maritime crews. 
While techniques that redirect attention have proven effective in alleviating motion sickness symptoms in terrestrial environments, applying similar strategies to manage seasickness poses unique challenges due to the prolonged and intense motion environment associate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08518v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08518v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08518v1-abstract-full" style="display: none;"> Seasickness is a prevalent issue that adversely impacts both passenger experiences and the operational efficiency of maritime crews. While techniques that redirect attention have proven effective in alleviating motion sickness symptoms in terrestrial environments, applying similar strategies to manage seasickness poses unique challenges due to the prolonged and intense motion environment associated with maritime travel. In this study, we propose a mindfulness brain-computer interface (BCI), specifically designed to redirect attention with the aim of mitigating seasickness symptoms in real-world settings. Our system utilizes a single-channel headband to capture prefrontal EEG signals, which are then wirelessly transmitted to computing devices for the assessment of mindfulness states. The results are translated into real-time feedback as mindfulness scores and audiovisual stimuli, facilitating a shift in attentional focus from physiological discomfort to mindfulness practices. A total of 43 individuals participated in a real-world maritime experiment consisting of three sessions: a real-feedback mindfulness session, a resting session, and a pseudofeedback mindfulness session. Notably, 81.39% of participants reported that the mindfulness BCI intervention was effective, and there was a significant reduction in the severity of seasickness, as measured by the Misery Scale (MISC). Furthermore, EEG analysis revealed a decrease in the theta/beta ratio, corresponding with the alleviation of seasickness symptoms. A decrease in overall EEG band power during the real-feedback mindfulness session suggests that the mindfulness BCI fosters a more tranquil and downregulated state of brain activity. Together, this study presents a novel nonpharmacological, portable, and effective approach for seasickness intervention, with the potential to enhance the cruising experience for both passengers and crews. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08518v1-abstract-full').style.display = 'none'; document.getElementById('2501.08518v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
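<p class="is-size-7">The seasickness-BCI entry above reports a drop in the EEG theta/beta ratio alongside symptom relief. As a generic illustration (not the study's exact processing chain), the snippet below computes such a ratio for one channel via a Welch power spectrum, using the common 4-8 Hz theta and 13-30 Hz beta conventions on a synthetic signal.</p> <pre><code class="language-python">
# Generic single-channel theta/beta power ratio: Welch PSD, then integrate the
# theta (4-8 Hz) and beta (13-30 Hz) bands.  Band edges and the synthetic signal
# are conventional, illustrative choices.
import numpy as np
from scipy.signal import welch
from scipy.integrate import trapezoid

fs = 250.0                                     # sampling rate in Hz
t = np.arange(0, 30, 1.0 / fs)                 # 30 s of synthetic "EEG"
rng = np.random.default_rng(4)
eeg = (np.sin(2 * np.pi * 6 * t)               # theta-band component
       + 0.5 * np.sin(2 * np.pi * 20 * t)      # beta-band component
       + 0.1 * rng.normal(size=t.size))        # noise

freqs, psd = welch(eeg, fs=fs, nperseg=int(4 * fs))

def band_power(lo, hi):
    band = np.logical_and(freqs >= lo, hi >= freqs)
    return trapezoid(psd[band], freqs[band])

ratio = band_power(4, 8) / band_power(13, 30)
print("theta/beta ratio:", round(float(ratio), 3))
</code></pre>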
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07805">arXiv:2501.07805</a> <span> [<a href="https://arxiv.org/pdf/2501.07805">pdf</a>, <a href="https://arxiv.org/format/2501.07805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> How Far are App Secrets from Being Stolen? A Case Study on Android </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+L">Lili Wei</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Heqing Huang</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+S">Shing-Chi Cheung</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kevin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07805v1-abstract-short" style="display: inline;"> Android apps can hold secret strings of themselves such as cloud service credentials or encryption keys. Leakage of such secret strings can induce unprecedented consequences like monetary losses or leakage of user private information. In practice, various security issues were reported because many apps failed to protect their secrets. However, little is known about the types, usages, exploitabilit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07805v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07805v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07805v1-abstract-full" style="display: none;"> Android apps can hold secret strings of themselves such as cloud service credentials or encryption keys. Leakage of such secret strings can induce unprecedented consequences like monetary losses or leakage of user private information. In practice, various security issues were reported because many apps failed to protect their secrets. However, little is known about the types, usages, exploitability, and consequences of app secret leakage issues. While a large body of literature has been devoted to studying user private information leakage, there is no systematic study characterizing app secret leakage issues. How far are Android app secrets from being stolen? To bridge this gap, we conducted the first systematic study to characterize app secret leakage issues in Android apps based on 575 potential app secrets sampled from 14,665 popular Android apps on Google Play. We summarized the common categories of leaked app secrets, assessed their security impacts and disclosed app bad practices in storing app secrets. We devised a text mining strategy using regular expressions and demonstrated that numerous app secrets can be easily stolen, even from the highly popular Android apps on Google. In a follow-up study, we harvested 3,711 distinct exploitable app secrets through automatic analysis. Our findings highlight the prevalence of this problem and call for greater attention to app secret protection. 
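<p class="is-size-7">The study above mines candidate app secrets with regular expressions. The fragment below shows the general shape of such a scan using two widely known key formats (AWS access key IDs and Google API keys); the pattern list and the sample strings are illustrative, not the paper's actual rule set.</p> <pre><code class="language-python">
# Tiny illustration of regex-based secret mining over strings extracted from an
# app package.  The two patterns below are widely known key formats and stand in
# for a larger rule set.
import re

PATTERNS = {
    "aws_access_key_id": re.compile(r"AKIA[0-9A-Z]{16}"),
    "google_api_key": re.compile(r"AIza[0-9A-Za-z_\-]{35}"),
}

strings_from_apk = [
    "https://example.com/api",
    "AKIAIOSFODNN7EXAMPLE",                    # AWS's documented example key ID
    "debug=false",
]

for s in strings_from_apk:
    for name, pat in PATTERNS.items():
        if pat.search(s):
            print(f"possible {name}: {s}")
</code></pre>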
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07805v1-abstract-full').style.display = 'none'; document.getElementById('2501.07805v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A pre-print version of the paper. It is accepted to Empirical Software Engineering (EMSE)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05819">arXiv:2501.05819</a> <span> [<a href="https://arxiv.org/pdf/2501.05819">pdf</a>, <a href="https://arxiv.org/format/2501.05819">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Models for Smarter UAVs: Decision-Making and Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emami%2C+Y">Yousef Emami</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&query=Almeida%2C+L">Luis Almeida</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kai Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05819v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicles (UAVs) are increasingly adopted in modern communication networks. However, challenges in decision-making and digital modeling continue to impede their rapid advancement. Reinforcement Learning (RL) algorithms face limitations such as low sample efficiency and limited data versatility, further magnified in UAV communication scenarios. Moreover, Digital Twin (DT) modeling in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05819v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05819v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05819v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicles (UAVs) are increasingly adopted in modern communication networks. However, challenges in decision-making and digital modeling continue to impede their rapid advancement. Reinforcement Learning (RL) algorithms face limitations such as low sample efficiency and limited data versatility, further magnified in UAV communication scenarios. Moreover, Digital Twin (DT) modeling introduces substantial decision-making and data management complexities. RL models, often integrated into DT frameworks, require extensive training data to achieve accurate predictions. In contrast to traditional approaches that focus on class boundaries, Diffusion Models (DMs), a new class of generative AI, learn the underlying probability distribution from the training data and can generate trustworthy new patterns based on this learned distribution. 
This paper explores the integration of DMs with RL and DT to effectively address these challenges. By combining the data generation capabilities of DMs with the decision-making framework of RL and the modeling accuracy of DT, the integration improves the adaptability and real-time performance of UAV communication. Moreover, the study shows how DMs can alleviate data scarcity, improve policy networks, and optimize dynamic modeling, providing a robust solution for complex UAV communication scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05819v1-abstract-full').style.display = 'none'; document.getElementById('2501.05819v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 53-01 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.2; I.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05272">arXiv:2501.05272</a> <span> [<a href="https://arxiv.org/pdf/2501.05272">pdf</a>, <a href="https://arxiv.org/format/2501.05272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Solving the Catastrophic Forgetting Problem in Generalized Category Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xinzi Cao</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiawu Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanhong Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weijiang Yu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yunhang Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yutong Lu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yonghong Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05272v1-abstract-short" style="display: inline;"> Generalized Category Discovery (GCD) aims to identify a mix of known and novel categories within unlabeled data sets, providing a more realistic setting for image recognition. Essentially, GCD needs to remember existing patterns thoroughly to recognize novel categories. 
Recent state-of-the-art method SimGCD transfers the knowledge from known-class data to the learning of novel classes through debi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05272v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05272v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05272v1-abstract-full" style="display: none;"> Generalized Category Discovery (GCD) aims to identify a mix of known and novel categories within unlabeled data sets, providing a more realistic setting for image recognition. Essentially, GCD needs to remember existing patterns thoroughly to recognize novel categories. Recent state-of-the-art method SimGCD transfers the knowledge from known-class data to the learning of novel classes through debiased learning. However, some patterns are catastrophically forgotten during adaptation and thus lead to poor performance in novel category classification. To address this issue, we propose a novel learning approach, LegoGCD, which is seamlessly integrated into previous methods to enhance the discrimination of novel classes while maintaining performance on previously encountered known classes. Specifically, we design two types of techniques termed Local Entropy Regularization (LER) and Dual-views Kullback-Leibler divergence constraint (DKL). The LER optimizes the distribution of potential known class samples in unlabeled data, thus ensuring the preservation of knowledge related to known categories while learning novel classes. Meanwhile, DKL introduces a Kullback-Leibler divergence to encourage the model to produce similar prediction distributions for two view samples from the same image. In this way, it successfully avoids mismatched predictions and generates more reliable potential known class samples simultaneously. Extensive experiments validate that the proposed LegoGCD effectively addresses the known category forgetting issue across all datasets, e.g., delivering a 7.74% and 2.51% accuracy boost on known and novel classes in CUB, respectively. Our code is available at: https://github.com/Cliffia123/LegoGCD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05272v1-abstract-full').style.display = 'none'; document.getElementById('2501.05272v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
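<p class="is-size-7">LegoGCD's DKL term, as described above, pushes the predictions for two views of the same image toward each other with a Kullback-Leibler divergence. The snippet below is a minimal stand-alone version of that idea using a symmetric KL between two softmax outputs; the symmetric form and the toy logits are illustrative, since the abstract does not pin down the exact formulation.</p> <pre><code class="language-python">
# Minimal two-view consistency term: KL divergence between the softmax
# predictions of two augmented views of the same image.  The symmetric averaging
# is one common choice, not necessarily LegoGCD's exact loss.
import numpy as np

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

def kl(p, q, eps=1e-8):
    return float(np.sum(p * np.log((p + eps) / (q + eps))))

rng = np.random.default_rng(5)
logits_view1 = rng.normal(size=10)                        # class logits, view 1
logits_view2 = logits_view1 + 0.1 * rng.normal(size=10)   # slightly perturbed view 2

p, q = softmax(logits_view1), softmax(logits_view2)
dkl_loss = 0.5 * (kl(p, q) + kl(q, p))                    # symmetric consistency term
print("two-view KL consistency:", round(dkl_loss, 4))
</code></pre>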
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04606">arXiv:2501.04606</a> <span> [<a href="https://arxiv.org/pdf/2501.04606">pdf</a>, <a href="https://arxiv.org/format/2501.04606">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low-Cost Video Editing with Lightweight Adaptors and Temporal-Aware Inversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sida Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Binxu Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jun Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xueqian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04606v1-abstract-short" style="display: inline;"> Recent advancements in text-to-image (T2I) generation using diffusion models have enabled cost-effective video-editing applications by leveraging pre-trained models, eliminating the need for resource-intensive training. However, the frame-independence of T2I generation often results in poor temporal consistency. Existing methods address this issue through temporal layer fine-tuning or inference-ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04606v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04606v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04606v1-abstract-full" style="display: none;"> Recent advancements in text-to-image (T2I) generation using diffusion models have enabled cost-effective video-editing applications by leveraging pre-trained models, eliminating the need for resource-intensive training. However, the frame-independence of T2I generation often results in poor temporal consistency. Existing methods address this issue through temporal layer fine-tuning or inference-based temporal propagation, but these approaches suffer from high training costs or limited temporal coherence. To address these challenges, we propose a General and Efficient Adapter (GE-Adapter) that integrates temporal-spatial and semantic consistency with Baliteral DDIM inversion. 
This framework introduces three key components: (1) Frame-based Temporal Consistency Blocks (FTC Blocks) to capture frame-specific features and enforce smooth inter-frame transitions via temporally-aware loss functions; (2) Channel-dependent Spatial Consistency Blocks (SCD Blocks) employing bilateral filters to enhance spatial coherence by reducing noise and artifacts; and (3) Token-based Semantic Consistency Module (TSC Module) to maintain semantic alignment using shared prompt tokens and frame-specific tokens. Our method significantly improves perceptual quality, text-image alignment, and temporal coherence, as demonstrated on the MSR-VTT dataset. Additionally, it achieves enhanced fidelity and frame-to-frame coherence, offering a practical solution for T2V editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04606v1-abstract-full').style.display = 'none'; document.getElementById('2501.04606v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03223">arXiv:2501.03223</a> <span> [<a href="https://arxiv.org/pdf/2501.03223">pdf</a>, <a href="https://arxiv.org/format/2501.03223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rate-My-LoRA: Efficient and Adaptive Federated Model Tuning for Cardiac MRI Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxiao He</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Haizhou Shi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Ligong Han</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaowei Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zihao Xu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+M">Meng Ye</a>, <a href="/search/cs?searchtype=author&query=Axel%2C+L">Leon Axel</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kang Li</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D">Dimitris Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03223v1-abstract-short" style="display: inline;"> Cardiovascular disease (CVD) and cardiac dyssynchrony are major public health problems in the United States. Precise cardiac image segmentation is crucial for extracting quantitative measures that help categorize cardiac dyssynchrony. However, achieving high accuracy often depends on centralizing large datasets from different hospitals, which can be challenging due to privacy concerns. 
To solve th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03223v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03223v1-abstract-full" style="display: none;"> Cardiovascular disease (CVD) and cardiac dyssynchrony are major public health problems in the United States. Precise cardiac image segmentation is crucial for extracting quantitative measures that help categorize cardiac dyssynchrony. However, achieving high accuracy often depends on centralizing large datasets from different hospitals, which can be challenging due to privacy concerns. To solve this problem, Federated Learning (FL) is proposed to enable decentralized model training on such data without exchanging sensitive information. However, bandwidth limitations and data heterogeneity remain significant challenges in conventional FL algorithms. In this paper, we propose a novel efficient and adaptive federated learning method for cardiac segmentation that improves model performance while reducing the bandwidth requirement. Our method leverages low-rank adaptation (LoRA) to regularize model weight updates and reduce communication overhead. We also propose a Rate-My-LoRA aggregation technique to address data heterogeneity among clients. This technique adaptively penalizes the aggregated weights from different clients by comparing the validation accuracy in each client, allowing better generalization performance and fast local adaptation. In-client and cross-client evaluations on public cardiac MR datasets demonstrate the superiority of our method over other LoRA-based federated learning approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03223v1-abstract-full').style.display = 'none'; document.getElementById('2501.03223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
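<p class="is-size-7">The entry above describes an aggregation step that penalizes client updates according to each client's validation accuracy. The exact rule is not given in the abstract, so the sketch below uses one plausible variant: a softmax over validation accuracies weights each client's LoRA factor before averaging. Shapes, the scaling constant, and the accuracies are illustrative.</p> <pre><code class="language-python">
# Illustrative accuracy-aware aggregation of client LoRA updates: weight each
# client's low-rank factor by a softmax over its validation accuracy, then
# average.  The softmax weighting stands in for the paper's unpublished rule.
import numpy as np

rng = np.random.default_rng(6)
n_clients, rank, hidden = 4, 8, 64
client_A = rng.normal(size=(n_clients, hidden, rank))    # per-client LoRA factors
val_acc = np.array([0.82, 0.79, 0.88, 0.61])             # per-client validation accuracy

w = np.exp(val_acc * 10.0)                               # 10.0 is an illustrative scale
w = w / w.sum()                                          # higher accuracy -> larger weight
global_A = np.tensordot(w, client_A, axes=1)             # weighted average of factors
print("aggregation weights:", np.round(w, 3), "global factor shape:", global_A.shape)
</code></pre>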
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ISBI 2025</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>