Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 974 results for author: <span class="mathjax">He, Z</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=He%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="He, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=He%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="He, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=He%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20582">arXiv:2502.20582</a> <span> [<a href="https://arxiv.org/pdf/2502.20582">pdf</a>, <a href="https://arxiv.org/format/2502.20582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> CS-PaperSum: A Large-Scale Dataset of AI-Generated Summaries for Scientific Papers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Javin Liu</a>, <a href="/search/cs?searchtype=author&query=Vats%2C+A">Aryan Vats</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zihao He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20582v1-abstract-short" style="display: inline;"> The rapid expansion of scientific literature in computer science presents challenges in tracking research trends and extracting key insights. Existing datasets provide metadata but lack structured summaries that capture core contributions and methodologies. We introduce CS-PaperSum, a large-scale dataset of 91,919 papers from 31 top-tier computer science conferences, enriched with AI-generated str… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20582v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20582v1-abstract-full" style="display: none;"> The rapid expansion of scientific literature in computer science presents challenges in tracking research trends and extracting key insights. Existing datasets provide metadata but lack structured summaries that capture core contributions and methodologies. We introduce CS-PaperSum, a large-scale dataset of 91,919 papers from 31 top-tier computer science conferences, enriched with AI-generated structured summaries using ChatGPT. To assess summary quality, we conduct embedding alignment analysis and keyword overlap analysis, demonstrating strong preservation of key concepts. We further present a case study on AI research trends, highlighting shifts in methodologies and interdisciplinary crossovers, including the rise of self-supervised learning, retrieval-augmented generation, and multimodal AI. Our dataset enables automated literature analysis, research trend forecasting, and AI-driven scientific discovery, providing a valuable resource for researchers, policymakers, and scientific information retrieval systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20582v1-abstract-full').style.display = 'none'; document.getElementById('2502.20582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20190">arXiv:2502.20190</a> <span> [<a href="https://arxiv.org/pdf/2502.20190">pdf</a>, <a href="https://arxiv.org/format/2502.20190">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Highly Parallelized Reinforcement Learning Training with Relaxed Assignment Dependencies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhouyu He</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+P">Peng Qiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rongchun Li</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Y">Yong Dou</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yusong Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20190v1-abstract-short" style="display: inline;"> As the demands for superior agents grow, the training complexity of Deep Reinforcement Learning (DRL) becomes higher. Thus, accelerating training of DRL has become a major research focus. Dividing the DRL training process into subtasks and using parallel computation can effectively reduce training costs. However, current DRL training systems lack sufficient parallelization due to data assignment b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20190v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20190v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20190v1-abstract-full" style="display: none;"> As the demands for superior agents grow, the training complexity of Deep Reinforcement Learning (DRL) becomes higher. Thus, accelerating training of DRL has become a major research focus. Dividing the DRL training process into subtasks and using parallel computation can effectively reduce training costs. However, current DRL training systems lack sufficient parallelization due to data assignment between subtask components. This assignment issue has been ignored, but addressing it can further boost training efficiency. Therefore, we propose a high-throughput distributed RL training system called TianJi. It relaxes assignment dependencies between subtask components and enables event-driven asynchronous communication. Meanwhile, TianJi maintains clear boundaries between subtask components. To address convergence uncertainty from relaxed assignment dependencies, TianJi proposes a distributed strategy based on the balance of sample production and consumption. The strategy controls the staleness of samples to correct their quality, ensuring convergence. We conducted extensive experiments. TianJi achieves a convergence time acceleration ratio of up to 4.37 compared to related comparison systems. When scaled to eight computational nodes, TianJi shows a convergence time speedup of 1.6 and a throughput speedup of 7.13 relative to XingTian, demonstrating its capability to accelerate training and scalability. In data transmission efficiency experiments, TianJi significantly outperforms other systems, approaching hardware limits. TianJi also shows effectiveness in on-policy algorithms, achieving convergence time acceleration ratios of 4.36 and 2.95 compared to RLlib and XingTian. TianJi is accessible at https://github.com/HiPRL/TianJi.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20190v1-abstract-full').style.display = 'none'; document.getElementById('2502.20190v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19058">arXiv:2502.19058</a> <span> [<a href="https://arxiv.org/pdf/2502.19058">pdf</a>, <a href="https://arxiv.org/format/2502.19058">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MathClean: A Benchmark for Synthetic Mathematical Data Cleaning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+H">Hao Liang</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+M">Meiyi Qiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuying Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zefeng He</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yongzhen Guo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhengzhou Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Bin Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19058v1-abstract-short" style="display: inline;"> With the rapid development of large language models (LLMs), the quality of training data has become crucial. Among the various types of training data, mathematical data plays a key role in enabling LLMs to acquire strong reasoning abilities. While high-quality open-source data is important, it is often insufficient for pre-training, necessitating the addition of synthetic math problems. However, s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19058v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19058v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19058v1-abstract-full" style="display: none;"> With the rapid development of large language models (LLMs), the quality of training data has become crucial. Among the various types of training data, mathematical data plays a key role in enabling LLMs to acquire strong reasoning abilities. While high-quality open-source data is important, it is often insufficient for pre-training, necessitating the addition of synthetic math problems. However, synthetic math questions and answers can introduce inaccuracies, which may degrade both the training data and web data. Therefore, an effective method for cleaning synthetic math data is essential. In this paper, we propose the MathClean benchmark to evaluate the effectiveness of math data cleaning models. The MathClean benchmark consists of 2,000 correct questions and 2,000 erroneous questions with additional 2,000 correct and erroneous answers sourced from augmented data based on GSM8K and MATH. Moreover, we also annotate error types for each question or answer, since it can assess whether models can correctly identify the error categories for future improvements. Finally, we present comprehensive evaluations using state-of-the-art (SOTA) models. Our results demonstrate that even strong models like GPT-o1 and DeepSeek-R1 perform poorly on this benchmark, highlighting the utility of MathClean. Our code and data is available at https://github.com/YuYingLi0/MathClean. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19058v1-abstract-full').style.display = 'none'; document.getElementById('2502.19058v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18511">arXiv:2502.18511</a> <span> [<a href="https://arxiv.org/pdf/2502.18511">pdf</a>, <a href="https://arxiv.org/format/2502.18511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ELBA-Bench: An Efficient Learning Backdoor Attacks Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuxu Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Siyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+M">Mengya Han</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yong Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aishan Liu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xiantao Cai</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zheng He</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18511v1-abstract-short" style="display: inline;"> Generative large language models are crucial in natural language processing, but they are vulnerable to backdoor attacks, where subtle triggers compromise their behavior. Although backdoor attacks against LLMs are constantly emerging, existing benchmarks remain limited in terms of sufficient coverage of attack, metric system integrity, backdoor attack alignment. And existing pre-trained backdoor a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18511v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18511v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18511v1-abstract-full" style="display: none;"> Generative large language models are crucial in natural language processing, but they are vulnerable to backdoor attacks, where subtle triggers compromise their behavior. Although backdoor attacks against LLMs are constantly emerging, existing benchmarks remain limited in terms of sufficient coverage of attack, metric system integrity, backdoor attack alignment. And existing pre-trained backdoor attacks are idealized in practice due to resource access constraints. Therefore we establish $\textit{ELBA-Bench}$, a comprehensive and unified framework that allows attackers to inject backdoor through parameter efficient fine-tuning ($\textit{e.g.,}$ LoRA) or without fine-tuning techniques ($\textit{e.g.,}$ In-context-learning). $\textit{ELBA-Bench}$ provides over 1300 experiments encompassing the implementations of 12 attack methods, 18 datasets, and 12 LLMs. Extensive experiments provide new invaluable findings into the strengths and limitations of various attack strategies. For instance, PEFT attack consistently outperform without fine-tuning approaches in classification tasks while showing strong cross-dataset generalization with optimized triggers boosting robustness; Task-relevant backdoor optimization techniques or attack prompts along with clean and adversarial demonstrations can enhance backdoor attack success while preserving model performance on clean samples. Additionally, we introduce a universal toolbox designed for standardized backdoor attack research, with the goal of propelling further progress in this vital area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18511v1-abstract-full').style.display = 'none'; document.getElementById('2502.18511v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17529">arXiv:2502.17529</a> <span> [<a href="https://arxiv.org/pdf/2502.17529">pdf</a>, <a href="https://arxiv.org/format/2502.17529">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ConvoyLLM: Dynamic Multi-Lane Convoy Control Using LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+L">Liping Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhican He</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+D">Duanfeng Chu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rukang Wang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Saiqian Peng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pan Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17529v2-abstract-short" style="display: inline;"> This paper proposes a novel method for multi-lane convoy formation control that uses large language models (LLMs) to tackle coordination challenges in dynamic highway environments. Each connected and autonomous vehicle in the convoy uses a knowledge-driven approach to make real-time adaptive decisions based on various scenarios. Our method enables vehicles to dynamically perform tasks, including o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17529v2-abstract-full').style.display = 'inline'; document.getElementById('2502.17529v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17529v2-abstract-full" style="display: none;"> This paper proposes a novel method for multi-lane convoy formation control that uses large language models (LLMs) to tackle coordination challenges in dynamic highway environments. Each connected and autonomous vehicle in the convoy uses a knowledge-driven approach to make real-time adaptive decisions based on various scenarios. Our method enables vehicles to dynamically perform tasks, including obstacle avoidance, convoy joining/leaving, and escort formation switching, all while maintaining the overall convoy structure. We design a Interlaced formation control strategy based on locally dynamic distributed graphs, ensuring the convoy remains stable and flexible. We conduct extensive experiments in the SUMO simulation platform across multiple traffic scenarios, and the results demonstrate that the proposed method is effective, robust, and adaptable to dynamic environments. The code is available at: https://github.com/chuduanfeng/ConvoyLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17529v2-abstract-full').style.display = 'none'; document.getElementById('2502.17529v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16454">arXiv:2502.16454</a> <span> [<a href="https://arxiv.org/pdf/2502.16454">pdf</a>, <a href="https://arxiv.org/format/2502.16454">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MAPN: Enhancing Heterogeneous Sparse Graph Representation by Mamba-based Asynchronous Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xuqi Mao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenying He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+S">X. Sean Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16454v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) have become the state of the art for various graph-related tasks and are particularly prominent in heterogeneous graphs (HetGs). However, several issues plague this paradigm: first, the difficulty in fully utilizing long-range information, known as over-squashing; second, the tendency for excessive message-passing layers to produce indistinguishable representations, re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16454v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16454v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16454v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) have become the state of the art for various graph-related tasks and are particularly prominent in heterogeneous graphs (HetGs). However, several issues plague this paradigm: first, the difficulty in fully utilizing long-range information, known as over-squashing; second, the tendency for excessive message-passing layers to produce indistinguishable representations, referred to as over-smoothing; and finally, the inadequacy of conventional MPNNs to train effectively on large sparse graphs. To address these challenges in deep neural networks for large-scale heterogeneous graphs, this paper introduces the Mamba-based Asynchronous Propagation Network (MAPN), which enhances the representation of heterogeneous sparse graphs. MAPN consists of two primary components: node sequence generation and semantic information aggregation. Node sequences are initially generated based on meta-paths through random walks, which serve as the foundation for a spatial state model that extracts essential information from nodes at various distances. It then asynchronously aggregates semantic information across multiple hops and layers, effectively preserving unique node characteristics and mitigating issues related to deep network degradation. Extensive experiments across diverse datasets demonstrate the effectiveness of MAPN in graph embeddings for various downstream tasks underscoring its substantial benefits for graph representation in large sparse heterogeneous graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16454v1-abstract-full').style.display = 'none'; document.getElementById('2502.16454v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16288">arXiv:2502.16288</a> <span> [<a href="https://arxiv.org/pdf/2502.16288">pdf</a>, <a href="https://arxiv.org/ps/2502.16288">ps</a>, <a href="https://arxiv.org/format/2502.16288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11280-024-01303-1">10.1007/s11280-024-01303-1 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> HetFS: A Method for Fast Similarity Search with Ad-hoc Meta-paths on Heterogeneous Information Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xuqi Mao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenyi Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenying He</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+Y">Yinan Jing</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+S">X. Sean Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16288v1-abstract-short" style="display: inline;"> Numerous real-world information networks form Heterogeneous Information Networks (HINs) with diverse objects and relations represented as nodes and edges in heterogeneous graphs. Similarity between nodes quantifies how closely two nodes resemble each other, mainly depending on the similarity of the nodes they are connected to, recursively. Users may be interested in only specific types of connecti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16288v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16288v1-abstract-full" style="display: none;"> Numerous real-world information networks form Heterogeneous Information Networks (HINs) with diverse objects and relations represented as nodes and edges in heterogeneous graphs. Similarity between nodes quantifies how closely two nodes resemble each other, mainly depending on the similarity of the nodes they are connected to, recursively. Users may be interested in only specific types of connections in the similarity definition, represented as meta-paths, i.e., a sequence of node and edge types. Existing Heterogeneous Graph Neural Network (HGNN)-based similarity search methods may accommodate meta-paths, but require retraining for different meta-paths. Conversely, existing path-based similarity search methods may switch flexibly between meta-paths but often suffer from lower accuracy, as they rely solely on path information. This paper proposes HetFS, a Fast Similarity method for ad-hoc queries with user-given meta-paths on Heterogeneous information networks. HetFS provides similarity results based on path information that satisfies the meta-path restriction, as well as node content. Extensive experiments demonstrate the effectiveness and efficiency of HetFS in addressing ad-hoc queries, outperforming state-of-the-art HGNNs and path-based approaches, and showing strong performance in downstream applications, including link prediction, node classification, and clustering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16288v1-abstract-full').style.display = 'none'; document.getElementById('2502.16288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> World Wide Web Volume 27, article number 66, (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16281">arXiv:2502.16281</a> <span> [<a href="https://arxiv.org/pdf/2502.16281">pdf</a>, <a href="https://arxiv.org/ps/2502.16281">ps</a>, <a href="https://arxiv.org/format/2502.16281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FHGE: A Fast Heterogeneous Graph Embedding with Ad-hoc Meta-paths </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xuqi Mao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenying He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+S">X. Sean Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16281v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) have emerged as the state of the art for a variety of graph-related tasks and have been widely used in Heterogeneous Graphs (HetGs), where meta-paths help encode specific semantics between various node types. Despite the revolutionary representation capabilities of existing heterogeneous GNNs (HGNNs) due to their focus on improving the effectiveness of heterogeneity ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16281v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16281v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) have emerged as the state of the art for a variety of graph-related tasks and have been widely used in Heterogeneous Graphs (HetGs), where meta-paths help encode specific semantics between various node types. Despite the revolutionary representation capabilities of existing heterogeneous GNNs (HGNNs) due to their focus on improving the effectiveness of heterogeneity capturing, the huge training costs hinder their practical deployment in real-world scenarios that frequently require handling ad-hoc queries with user-defined meta-paths. To address this, we propose FHGE, a Fast Heterogeneous Graph Embedding designed for efficient, retraining-free generation of meta-path-guided graph embeddings. The key design of the proposed framework is two-fold: segmentation and reconstruction modules. It employs Meta-Path Units (MPUs) to segment the graph into local and global components, enabling swift integration of node embeddings from relevant MPUs during reconstruction and allowing quick adaptation to specific meta-paths. In addition, a dual attention mechanism is applied to enhance semantics capturing. Extensive experiments across diverse datasets demonstrate the effectiveness and efficiency of FHGE in generating meta-path-guided graph embeddings and downstream tasks, such as link prediction and node classification, highlighting its significant advantages for real-time graph analysis in ad-hoc queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16281v1-abstract-full').style.display = 'none'; document.getElementById('2502.16281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16115">arXiv:2502.16115</a> <span> [<a href="https://arxiv.org/pdf/2502.16115">pdf</a>, <a href="https://arxiv.org/format/2502.16115">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Detecting OOD Samples via Optimal Transport Scoring Function </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+H">Heng Gao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhuolin He</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+J">Jian Pu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16115v1-abstract-short" style="display: inline;"> To deploy machine learning models in the real world, researchers have proposed many OOD detection algorithms to help models identify unknown samples during the inference phase and prevent them from making untrustworthy predictions. Unlike methods that rely on extra data for outlier exposure training, post hoc methods detect Out-of-Distribution (OOD) samples by developing scoring functions, which a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16115v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16115v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16115v1-abstract-full" style="display: none;"> To deploy machine learning models in the real world, researchers have proposed many OOD detection algorithms to help models identify unknown samples during the inference phase and prevent them from making untrustworthy predictions. Unlike methods that rely on extra data for outlier exposure training, post hoc methods detect Out-of-Distribution (OOD) samples by developing scoring functions, which are model agnostic and do not require additional training. However, previous post hoc methods may fail to capture the geometric cues embedded in network representations. Thus, in this study, we propose a novel score function based on the optimal transport theory, named OTOD, for OOD detection. We utilize information from features, logits, and the softmax probability space to calculate the OOD score for each test sample. Our experiments show that combining this information can boost the performance of OTOD with a certain margin. Experiments on the CIFAR-10 and CIFAR-100 benchmarks demonstrate the superior performance of our method. Notably, OTOD outperforms the state-of-the-art method GEN by 7.19% in the mean FPR@95 on the CIFAR-10 benchmark using ResNet-18 as the backbone, and by 12.51% in the mean FPR@95 using WideResNet-28 as the backbone. In addition, we provide theoretical guarantees for OTOD. The code is available in https://github.com/HengGao12/OTOD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16115v1-abstract-full').style.display = 'none'; document.getElementById('2502.16115v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14938">arXiv:2502.14938</a> <span> [<a href="https://arxiv.org/pdf/2502.14938">pdf</a>, <a href="https://arxiv.org/format/2502.14938">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GS-Cache: A GS-Cache Inference Framework for Large-scale Gaussian Splatting Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+M">Miao Tao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuanzhen Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haoran Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zeyu He</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenyu Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuchang Zhang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhongling Su</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Linning Xu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhenxiang Ma</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+R">Rong Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hengjie Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingcheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+J">Jidong Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14938v1-abstract-short" style="display: inline;"> Rendering large-scale 3D Gaussian Splatting (3DGS) model faces significant challenges in achieving real-time, high-fidelity performance on consumer-grade devices. Fully realizing the potential of 3DGS in applications such as virtual reality (VR) requires addressing critical system-level challenges to support real-time, immersive experiences. We propose GS-Cache, an end-to-end framework that seamle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14938v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14938v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14938v1-abstract-full" style="display: none;"> Rendering large-scale 3D Gaussian Splatting (3DGS) model faces significant challenges in achieving real-time, high-fidelity performance on consumer-grade devices. Fully realizing the potential of 3DGS in applications such as virtual reality (VR) requires addressing critical system-level challenges to support real-time, immersive experiences. We propose GS-Cache, an end-to-end framework that seamlessly integrates 3DGS's advanced representation with a highly optimized rendering system. GS-Cache introduces a cache-centric pipeline to eliminate redundant computations, an efficiency-aware scheduler for elastic multi-GPU rendering, and optimized CUDA kernels to overcome computational bottlenecks. This synergy between 3DGS and system design enables GS-Cache to achieve up to 5.35x performance improvement, 35% latency reduction, and 42% lower GPU memory usage, supporting 2K binocular rendering at over 120 FPS with high visual quality. By bridging the gap between 3DGS's representation power and the demands of VR systems, GS-Cache establishes a scalable and efficient framework for real-time neural rendering in immersive environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14938v1-abstract-full').style.display = 'none'; document.getElementById('2502.14938v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14496">arXiv:2502.14496</a> <span> [<a href="https://arxiv.org/pdf/2502.14496">pdf</a>, <a href="https://arxiv.org/format/2502.14496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Language Multi-Agent Learning with Multi-Agent Credit Re-Assignment for Interactive Environment Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhitao He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zijun Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Fung%2C+M">May Fung</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14496v1-abstract-short" style="display: inline;"> LLM-based agents have made significant advancements in interactive environments, such as mobile operations and web browsing, and other domains beyond computer using. Current multi-agent systems universally excel in performance, compared to single agents, but struggle with generalization across environments due to predefined roles and inadequate strategies for generalizing language agents. The chal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14496v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14496v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14496v1-abstract-full" style="display: none;"> LLM-based agents have made significant advancements in interactive environments, such as mobile operations and web browsing, and other domains beyond computer using. Current multi-agent systems universally excel in performance, compared to single agents, but struggle with generalization across environments due to predefined roles and inadequate strategies for generalizing language agents. The challenge of achieving both strong performance and good generalization has hindered the progress of multi-agent systems for interactive environments. To address these issues, we propose CollabUIAgents, a multi-agent reinforcement learning framework with a novel multi-agent credit re-assignment (CR) strategy, assigning process rewards with LLMs rather than environment-specific rewards and learning with synthesized preference data, in order to foster generalizable, collaborative behaviors among the role-free agents' policies. Empirical results show that our framework improves both performance and cross-environment generalizability of multi-agent systems. Moreover, our 7B-parameter system achieves results on par with or exceed strong closed-source models, and the LLM that guides the CR. We also provide insights in using granular CR rewards effectively for environment generalization, and accommodating trained LLMs in multi-agent systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14496v1-abstract-full').style.display = 'none'; document.getElementById('2502.14496v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13581">arXiv:2502.13581</a> <span> [<a href="https://arxiv.org/pdf/2502.13581">pdf</a>, <a href="https://arxiv.org/format/2502.13581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ActionPiece: Contextually Tokenizing Action Sequences for Generative Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hou%2C+Y">Yupeng Hou</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhankui He</a>, <a href="/search/cs?searchtype=author&query=Sachdeva%2C+N">Noveen Sachdeva</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+W">Wang-Cheng Kang</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+E+H">Ed H. Chi</a>, <a href="/search/cs?searchtype=author&query=McAuley%2C+J">Julian McAuley</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+D+Z">Derek Zhiyuan Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13581v1-abstract-short" style="display: inline;"> Generative recommendation (GR) is an emerging paradigm where user actions are tokenized into discrete token patterns and autoregressively generated as predictions. However, existing GR models tokenize each action independently, assigning the same fixed tokens to identical actions across all sequences without considering contextual relationships. This lack of context-awareness can lead to suboptima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13581v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13581v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13581v1-abstract-full" style="display: none;"> Generative recommendation (GR) is an emerging paradigm where user actions are tokenized into discrete token patterns and autoregressively generated as predictions. However, existing GR models tokenize each action independently, assigning the same fixed tokens to identical actions across all sequences without considering contextual relationships. This lack of context-awareness can lead to suboptimal performance, as the same action may hold different meanings depending on its surrounding context. To address this issue, we propose ActionPiece to explicitly incorporate context when tokenizing action sequences. In ActionPiece, each action is represented as a set of item features, which serve as the initial tokens. Given the action sequence corpora, we construct the vocabulary by merging feature patterns as new tokens, based on their co-occurrence frequency both within individual sets and across adjacent sets. Considering the unordered nature of feature sets, we further introduce set permutation regularization, which produces multiple segmentations of action sequences with the same semantics. Experiments on public datasets demonstrate that ActionPiece consistently outperforms existing action tokenization methods, improving NDCG@$10$ by $6.00\%$ to $12.82\%$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13581v1-abstract-full').style.display = 'none'; document.getElementById('2502.13581v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13514">arXiv:2502.13514</a> <span> [<a href="https://arxiv.org/pdf/2502.13514">pdf</a>, <a href="https://arxiv.org/format/2502.13514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Shall Your Data Strategy Work? Perform a Swift Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+M">Minlong Peng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingyi Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhongjun He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hua Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13514v1-abstract-short" style="display: inline;"> This work presents a swift method to assess the efficacy of particular types of instruction-tuning data, utilizing just a handful of probe examples and eliminating the need for model retraining. This method employs the idea of gradient-based data influence estimation, analyzing the gradient projections of probe examples from the chosen strategy onto evaluation examples to assess its advantages. Bu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13514v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13514v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13514v1-abstract-full" style="display: none;"> This work presents a swift method to assess the efficacy of particular types of instruction-tuning data, utilizing just a handful of probe examples and eliminating the need for model retraining. This method employs the idea of gradient-based data influence estimation, analyzing the gradient projections of probe examples from the chosen strategy onto evaluation examples to assess its advantages. Building upon this method, we conducted three swift studies to investigate the potential of Chain-of-thought (CoT) data, query clarification data, and response evaluation data in enhancing model generalization. Subsequently, we embarked on a validation study to corroborate the findings of these swift studies. In this validation study, we developed training datasets tailored to each studied strategy and compared model performance with and without the use of these datasets. The results of the validation study aligned with the findings of the swift studies, validating the efficacy of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13514v1-abstract-full').style.display = 'none'; document.getElementById('2502.13514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12875">arXiv:2502.12875</a> <span> [<a href="https://arxiv.org/pdf/2502.12875">pdf</a>, <a href="https://arxiv.org/format/2502.12875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> A Survey on DRL based UAV Communications and Networking: DRL Fundamentals, Applications and Implementations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+S">Shaoxin Cui</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+W">Wen Qiu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhiqiang He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhi Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiao Zheng</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+B">Bomin Mao</a>, <a href="/search/cs?searchtype=author&query=Kato%2C+N">Nei Kato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12875v1-abstract-short" style="display: inline;"> Unmanned aerial vehicles (UAVs) are playing an increasingly pivotal role in modern communication networks,offering flexibility and enhanced coverage for a variety of applica-tions. However, UAV networks pose significant challenges due to their dynamic and distributed nature, particularly when dealing with tasks such as power allocation, channel assignment, caching,and task offloading. Traditional… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12875v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12875v1-abstract-full" style="display: none;"> Unmanned aerial vehicles (UAVs) are playing an increasingly pivotal role in modern communication networks,offering flexibility and enhanced coverage for a variety of applica-tions. However, UAV networks pose significant challenges due to their dynamic and distributed nature, particularly when dealing with tasks such as power allocation, channel assignment, caching,and task offloading. Traditional optimization techniques often struggle to handle the complexity and unpredictability of these environments, leading to suboptimal performance. This survey provides a comprehensive examination of how deep reinforcement learning (DRL) can be applied to solve these mathematical optimization problems in UAV communications and networking.Rather than simply introducing DRL methods, the focus is on demonstrating how these methods can be utilized to solve complex mathematical models of the underlying problems. We begin by reviewing the fundamental concepts of DRL, including value-based, policy-based, and actor-critic approaches. Then,we illustrate how DRL algorithms are applied to specific UAV network tasks by discussing from problem formulations to DRL implementation. By framing UAV communication challenges as optimization problems, this survey emphasizes the practical value of DRL in dynamic and uncertain environments. We also explore the strengths of DRL in handling large-scale network scenarios and the ability to continuously adapt to changes in the environment. In addition, future research directions are outlined, highlighting the potential for DRL to further enhance UAV communications and expand its applicability to more complex,multi-agent settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12875v1-abstract-full').style.display = 'none'; document.getElementById('2502.12875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12751">arXiv:2502.12751</a> <span> [<a href="https://arxiv.org/pdf/2502.12751">pdf</a>, <a href="https://arxiv.org/format/2502.12751">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Architect of the Bits World: Masked Autoregressive Modeling for Circuit Generation Guided by Truth Table </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haisheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shoubo Hu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhuolun He</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12751v1-abstract-short" style="display: inline;"> Logic synthesis, a critical stage in electronic design automation (EDA), optimizes gate-level circuits to minimize power consumption and area occupancy in integrated circuits (ICs). Traditional logic synthesis tools rely on human-designed heuristics, often yielding suboptimal results. Although differentiable architecture search (DAS) has shown promise in generating circuits from truth tables, it f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12751v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12751v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12751v1-abstract-full" style="display: none;"> Logic synthesis, a critical stage in electronic design automation (EDA), optimizes gate-level circuits to minimize power consumption and area occupancy in integrated circuits (ICs). Traditional logic synthesis tools rely on human-designed heuristics, often yielding suboptimal results. Although differentiable architecture search (DAS) has shown promise in generating circuits from truth tables, it faces challenges such as high computational complexity, convergence to local optima, and extensive hyperparameter tuning. Consequently, we propose a novel approach integrating conditional generative models with DAS for circuit generation. Our approach first introduces CircuitVQ, a circuit tokenizer trained based on our Circuit AutoEncoder We then develop CircuitAR, a masked autoregressive model leveraging CircuitVQ as the tokenizer. CircuitAR can generate preliminary circuit structures from truth tables, which guide DAS in producing functionally equivalent circuits. Notably, we observe the scalability and emergent capability in generating complex circuit structures of our CircuitAR models. Extensive experiments also show the superior performance of our method. This research bridges the gap between probabilistic generative models and precise circuit generation, offering a robust solution for logic synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12751v1-abstract-full').style.display = 'none'; document.getElementById('2502.12751v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12502">arXiv:2502.12502</a> <span> [<a href="https://arxiv.org/pdf/2502.12502">pdf</a>, <a href="https://arxiv.org/format/2502.12502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficient OpAmp Adaptation for Zoom Attention to Golden Contexts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Ming%2C+R">Rui Ming</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haisheng Zheng</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhuolun He</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12502v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown significant promise in question-answering (QA) tasks, particularly in retrieval-augmented generation (RAG) scenarios and long-context applications. However, their performance is hindered by noisy reference documents, which often distract from essential information. Despite fine-tuning efforts, Transformer-based architectures struggle to prioritize relevant c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12502v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12502v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown significant promise in question-answering (QA) tasks, particularly in retrieval-augmented generation (RAG) scenarios and long-context applications. However, their performance is hindered by noisy reference documents, which often distract from essential information. Despite fine-tuning efforts, Transformer-based architectures struggle to prioritize relevant content. This is evidenced by their tendency to allocate disproportionate attention to irrelevant or later-positioned documents. Recent work proposes the differential attention mechanism to address this issue, but this mechanism is limited by an unsuitable common-mode rejection ratio (CMRR) and high computational costs. Inspired by the operational amplifier (OpAmp), we propose the OpAmp adaptation to address these challenges, which is implemented with adapters efficiently. By integrating the adapter into pre-trained Transformer blocks, our approach enhances focus on the golden context without costly training from scratch. Empirical evaluations on noisy-context benchmarks reveal that our Qwen2.5-OpAmp-72B model, trained with our OpAmp adaptation, surpasses the performance of state-of-the-art LLMs, including DeepSeek-V3 and GPT-4o. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12502v1-abstract-full').style.display = 'none'; document.getElementById('2502.12502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11356">arXiv:2502.11356</a> <span> [<a href="https://arxiv.org/pdf/2502.11356">pdf</a>, <a href="https://arxiv.org/format/2502.11356">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SAIF: A Sparse Autoencoder Framework for Interpreting and Steering Instruction Following of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zirui He</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haiyan Zhao</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yiran Qiao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Payani%2C+A">Ali Payani</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Du%2C+M">Mengnan Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11356v1-abstract-short" style="display: inline;"> The ability of large language models (LLMs) to follow instructions is crucial for their practical applications, yet the underlying mechanisms remain poorly understood. This paper presents a novel framework that leverages sparse autoencoders (SAE) to interpret how instruction following works in these models. We demonstrate how the features we identify can effectively steer model outputs to align wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11356v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11356v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11356v1-abstract-full" style="display: none;"> The ability of large language models (LLMs) to follow instructions is crucial for their practical applications, yet the underlying mechanisms remain poorly understood. This paper presents a novel framework that leverages sparse autoencoders (SAE) to interpret how instruction following works in these models. We demonstrate how the features we identify can effectively steer model outputs to align with given instructions. Through analysis of SAE latent activations, we identify specific latents responsible for instruction following behavior. Our findings reveal that instruction following capabilities are encoded by a distinct set of instruction-relevant SAE latents. These latents both show semantic proximity to relevant instructions and demonstrate causal effects on model behavior. Our research highlights several crucial factors for achieving effective steering performance: precise feature identification, the role of final layer, and optimal instruction positioning. Additionally, we demonstrate that our methodology scales effectively across SAEs and LLMs of varying sizes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11356v1-abstract-full').style.display = 'none'; document.getElementById('2502.11356v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 11 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11306">arXiv:2502.11306</a> <span> [<a href="https://arxiv.org/pdf/2502.11306">pdf</a>, <a href="https://arxiv.org/format/2502.11306">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Smoothing Out Hallucinations: Mitigating LLM Hallucination with Smoothed Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+H">Hieu Nguyen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zihao He</a>, <a href="/search/cs?searchtype=author&query=Gandre%2C+S+A">Shoumik Atul Gandre</a>, <a href="/search/cs?searchtype=author&query=Pasupulety%2C+U">Ujjwal Pasupulety</a>, <a href="/search/cs?searchtype=author&query=Shivakumar%2C+S+K">Sharanya Kumari Shivakumar</a>, <a href="/search/cs?searchtype=author&query=Lerman%2C+K">Kristina Lerman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11306v1-abstract-short" style="display: inline;"> Large language models (LLMs) often suffer from hallucination, generating factually incorrect or ungrounded content, which limits their reliability in high-stakes applications. A key factor contributing to hallucination is the use of hard labels during training, which enforce deterministic supervision, encourage overconfidence, and disregard the uncertainty inherent in natural language. To address… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11306v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11306v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11306v1-abstract-full" style="display: none;"> Large language models (LLMs) often suffer from hallucination, generating factually incorrect or ungrounded content, which limits their reliability in high-stakes applications. A key factor contributing to hallucination is the use of hard labels during training, which enforce deterministic supervision, encourage overconfidence, and disregard the uncertainty inherent in natural language. To address this, we propose mitigating hallucination through knowledge distillation (KD), where a teacher model provides smoothed soft labels to a student model, reducing overconfidence and improving factual grounding. We apply KD during supervised finetuning on instructional data, evaluating its effectiveness across LLMs from different families. Experimental results on summarization benchmarks demonstrate that KD reduces hallucination compared to standard finetuning while preserving performance on general NLP tasks. These findings highlight KD as a promising approach for mitigating hallucination in LLMs and improving model reliability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11306v1-abstract-full').style.display = 'none'; document.getElementById('2502.11306v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11267">arXiv:2502.11267</a> <span> [<a href="https://arxiv.org/pdf/2502.11267">pdf</a>, <a href="https://arxiv.org/format/2502.11267">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714319">10.1145/3706598.3714319 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Prompting in the Dark: Assessing Human Performance in Prompt Engineering for Data Labeling When Gold Labels Are Absent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zeyu He</a>, <a href="/search/cs?searchtype=author&query=Naphade%2C+S">Saniya Naphade</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T+%27">Ting-Hao 'Kenneth' Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11267v1-abstract-short" style="display: inline;"> Millions of users prompt large language models (LLMs) for various tasks, but how good are people at prompt engineering? Do users actually get closer to their desired outcome over multiple iterations of their prompts? These questions are crucial when no gold-standard labels are available to measure progress. This paper investigates a scenario in LLM-powered data labeling, "prompting in the dark," w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11267v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11267v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11267v1-abstract-full" style="display: none;"> Millions of users prompt large language models (LLMs) for various tasks, but how good are people at prompt engineering? Do users actually get closer to their desired outcome over multiple iterations of their prompts? These questions are crucial when no gold-standard labels are available to measure progress. This paper investigates a scenario in LLM-powered data labeling, "prompting in the dark," where users iteratively prompt LLMs to label data without using manually-labeled benchmarks. We developed PromptingSheet, a Google Sheets add-on that enables users to compose, revise, and iteratively label data through spreadsheets. Through a study with 20 participants, we found that prompting in the dark was highly unreliable-only 9 participants improved labeling accuracy after four or more iterations. Automated prompt optimization tools like DSPy also struggled when few gold labels were available. Our findings highlight the importance of gold labels and the needs, as well as the risks, of automated support in human prompt engineering, providing insights for future tool design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11267v1-abstract-full').style.display = 'none'; document.getElementById('2502.11267v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted By CHI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11124">arXiv:2502.11124</a> <span> [<a href="https://arxiv.org/pdf/2502.11124">pdf</a>, <a href="https://arxiv.org/format/2502.11124">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AdaManip: Adaptive Articulated Object Manipulation Environments and Policy Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuanfei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaojie Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Ruihai Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yan Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Mingdong Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhaofeng He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Hao Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11124v1-abstract-short" style="display: inline;"> Articulated object manipulation is a critical capability for robots to perform various tasks in real-world scenarios. Composed of multiple parts connected by joints, articulated objects are endowed with diverse functional mechanisms through complex relative motions. For example, a safe consists of a door, a handle, and a lock, where the door can only be opened when the latch is unlocked. The inter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11124v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11124v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11124v1-abstract-full" style="display: none;"> Articulated object manipulation is a critical capability for robots to perform various tasks in real-world scenarios. Composed of multiple parts connected by joints, articulated objects are endowed with diverse functional mechanisms through complex relative motions. For example, a safe consists of a door, a handle, and a lock, where the door can only be opened when the latch is unlocked. The internal structure, such as the state of a lock or joint angle constraints, cannot be directly observed from visual observation. Consequently, successful manipulation of these objects requires adaptive adjustment based on trial and error rather than a one-time visual inference. However, previous datasets and simulation environments for articulated objects have primarily focused on simple manipulation mechanisms where the complete manipulation process can be inferred from the object's appearance. To enhance the diversity and complexity of adaptive manipulation mechanisms, we build a novel articulated object manipulation environment and equip it with 9 categories of objects. Based on the environment and objects, we further propose an adaptive demonstration collection and 3D visual diffusion-based imitation learning pipeline that learns the adaptive manipulation policy. The effectiveness of our designs and proposed method is validated through both simulation and real-world experiments. Our project page is available at: https://adamanip.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11124v1-abstract-full').style.display = 'none'; document.getElementById('2502.11124v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10857">arXiv:2502.10857</a> <span> [<a href="https://arxiv.org/pdf/2502.10857">pdf</a>, <a href="https://arxiv.org/format/2502.10857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Divergent Thoughts toward One Goal: LLM-based Multi-Agent Collaboration System for Electronic Design Automation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haisheng Zheng</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhuolun He</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10857v1-abstract-short" style="display: inline;"> Recently, with the development of tool-calling capabilities in large language models (LLMs), these models have demonstrated significant potential for automating electronic design automation (EDA) flows by interacting with EDA tool APIs via EDA scripts. However, considering the limited understanding of EDA tools, LLMs face challenges in practical scenarios where diverse interfaces of EDA tools exis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10857v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10857v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10857v1-abstract-full" style="display: none;"> Recently, with the development of tool-calling capabilities in large language models (LLMs), these models have demonstrated significant potential for automating electronic design automation (EDA) flows by interacting with EDA tool APIs via EDA scripts. However, considering the limited understanding of EDA tools, LLMs face challenges in practical scenarios where diverse interfaces of EDA tools exist across different platforms. Additionally, EDA flow automation often involves intricate, long-chain tool-calling processes, increasing the likelihood of errors in intermediate steps. Any errors will lead to the instability and failure of EDA flow automation. To address these challenges, we introduce EDAid, a multi-agent collaboration system where multiple agents harboring divergent thoughts converge towards a common goal, ensuring reliable and successful EDA flow automation. Specifically, each agent is controlled by ChipLlama models, which are expert LLMs fine-tuned for EDA flow automation. Our experiments demonstrate the state-of-the-art (SOTA) performance of our ChipLlama models and validate the effectiveness of our EDAid in the automation of complex EDA flows, showcasing superior performance compared to single-agent systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10857v1-abstract-full').style.display = 'none'; document.getElementById('2502.10857v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09971">arXiv:2502.09971</a> <span> [<a href="https://arxiv.org/pdf/2502.09971">pdf</a>, <a href="https://arxiv.org/format/2502.09971">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Conditional Latent Coding with Learnable Synthesized Reference for Deep Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Siqi Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yinda Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhihai He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09971v1-abstract-short" style="display: inline;"> In this paper, we study how to synthesize a dynamic reference from an external dictionary to perform conditional coding of the input image in the latent domain and how to learn the conditional latent synthesis and coding modules in an end-to-end manner. Our approach begins by constructing a universal image feature dictionary using a multi-stage approach involving modified spatial pyramid pooling,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09971v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09971v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09971v1-abstract-full" style="display: none;"> In this paper, we study how to synthesize a dynamic reference from an external dictionary to perform conditional coding of the input image in the latent domain and how to learn the conditional latent synthesis and coding modules in an end-to-end manner. Our approach begins by constructing a universal image feature dictionary using a multi-stage approach involving modified spatial pyramid pooling, dimension reduction, and multi-scale feature clustering. For each input image, we learn to synthesize a conditioning latent by selecting and synthesizing relevant features from the dictionary, which significantly enhances the model's capability in capturing and exploring image source correlation. This conditional latent synthesis involves a correlation-based feature matching and alignment strategy, comprising a Conditional Latent Matching (CLM) module and a Conditional Latent Synthesis (CLS) module. The synthesized latent is then used to guide the encoding process, allowing for more efficient compression by exploiting the correlation between the input image and the reference dictionary. According to our theoretical analysis, the proposed conditional latent coding (CLC) method is robust to perturbations in the external dictionary samples and the selected conditioning latent, with an error bound that scales logarithmically with the dictionary size, ensuring stability even with large and diverse dictionaries. Experimental results on benchmark datasets show that our new method improves the coding performance by a large margin (up to 1.2 dB) with a very small overhead of approximately 0.5\% bits per pixel. Our code is publicly available at https://github.com/ydchen0806/CLC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09971v1-abstract-full').style.display = 'none'; document.getElementById('2502.09971v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09927">arXiv:2502.09927</a> <span> [<a href="https://arxiv.org/pdf/2502.09927">pdf</a>, <a href="https://arxiv.org/format/2502.09927">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Granite Vision: a lightweight, open-source multimodal model for enterprise Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Granite+Vision+Team"> Granite Vision Team</a>, <a href="/search/cs?searchtype=author&query=Karlinsky%2C+L">Leonid Karlinsky</a>, <a href="/search/cs?searchtype=author&query=Arbelle%2C+A">Assaf Arbelle</a>, <a href="/search/cs?searchtype=author&query=Daniels%2C+A">Abraham Daniels</a>, <a href="/search/cs?searchtype=author&query=Nassar%2C+A">Ahmed Nassar</a>, <a href="/search/cs?searchtype=author&query=Alfassi%2C+A">Amit Alfassi</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bo Wu</a>, <a href="/search/cs?searchtype=author&query=Schwartz%2C+E">Eli Schwartz</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+D">Dhiraj Joshi</a>, <a href="/search/cs?searchtype=author&query=Kondic%2C+J">Jovana Kondic</a>, <a href="/search/cs?searchtype=author&query=Shabtay%2C+N">Nimrod Shabtay</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pengyuan Li</a>, <a href="/search/cs?searchtype=author&query=Herzig%2C+R">Roei Herzig</a>, <a href="/search/cs?searchtype=author&query=Abedin%2C+S">Shafiq Abedin</a>, <a href="/search/cs?searchtype=author&query=Perek%2C+S">Shaked Perek</a>, <a href="/search/cs?searchtype=author&query=Harary%2C+S">Sivan Harary</a>, <a href="/search/cs?searchtype=author&query=Barzelay%2C+U">Udi Barzelay</a>, <a href="/search/cs?searchtype=author&query=Goldfarb%2C+A+R">Adi Raz Goldfarb</a>, <a href="/search/cs?searchtype=author&query=Oliva%2C+A">Aude Oliva</a>, <a href="/search/cs?searchtype=author&query=Wieles%2C+B">Ben Wieles</a>, <a href="/search/cs?searchtype=author&query=Bhattacharjee%2C+B">Bishwaranjan Bhattacharjee</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Brandon Huang</a>, <a href="/search/cs?searchtype=author&query=Auer%2C+C">Christoph Auer</a>, <a href="/search/cs?searchtype=author&query=Gutfreund%2C+D">Dan Gutfreund</a>, <a href="/search/cs?searchtype=author&query=Beymer%2C+D">David Beymer</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09927v1-abstract-short" style="display: inline;"> We introduce Granite Vision, a lightweight large language model with vision capabilities, specifically designed to excel in enterprise use cases, particularly in visual document understanding. Our model is trained on a comprehensive instruction-following dataset, including document-related tasks, such as content extraction from tables, charts, diagrams, sketches, and infographics, as well as gener… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09927v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09927v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09927v1-abstract-full" style="display: none;"> We introduce Granite Vision, a lightweight large language model with vision capabilities, specifically designed to excel in enterprise use cases, particularly in visual document understanding. Our model is trained on a comprehensive instruction-following dataset, including document-related tasks, such as content extraction from tables, charts, diagrams, sketches, and infographics, as well as general image tasks. The architecture of Granite Vision is centered around visual modality alignment with a decoder-only, 2 billion parameter Granite large language model. Additionally, we introduce a dedicated safety classification approach in test-time that leverages a sparse set of attention vectors to identify potential harmful inputs. Despite its lightweight architecture, Granite Vision achieves strong results in standard benchmarks related to visual document understanding, as well as on the LiveXiv benchmark, which is designed to avoid test set contamination by using a constantly updated corpus of recently published Arxiv papers. We are releasing the model under the Apache-2 license, allowing for both research and commercial use, while offering complete visibility into the training data and other relevant details. See https://huggingface.co/ibm-granite/ for model weights. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09927v1-abstract-full').style.display = 'none'; document.getElementById('2502.09927v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08807">arXiv:2502.08807</a> <span> [<a href="https://arxiv.org/pdf/2502.08807">pdf</a>, <a href="https://arxiv.org/format/2502.08807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InTAR: Inter-Task Auto-Reconfigurable Accelerator Design for High Data Volume Variation in DNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zifan He</a>, <a href="/search/cs?searchtype=author&query=Truong%2C+A">Anderson Truong</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yingqi Cao</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+J">Jason Cong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08807v1-abstract-short" style="display: inline;"> The rise of deep neural networks (DNNs) has driven a boom in AI services, which results in an increased demand for computing power and memory. In modern DNNs, the data sizes produced and consumed are highly varied across operations (high data volume variation, HDV). Because existing design paradigms use fixed execution patterns that lead to either low computational efficiency due to pipeline stall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08807v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08807v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08807v1-abstract-full" style="display: none;"> The rise of deep neural networks (DNNs) has driven a boom in AI services, which results in an increased demand for computing power and memory. In modern DNNs, the data sizes produced and consumed are highly varied across operations (high data volume variation, HDV). Because existing design paradigms use fixed execution patterns that lead to either low computational efficiency due to pipeline stalls or frequent off-chip memory accesses to manage large intermediate data, HDV applications are challenging to accelerate on FPGAs. To address these challenges, we introduce the Inter-Task Auto-Reconfigurable Accelerator (InTAR), a novel accelerator design for HDV applications on FPGAs. InTAR combines the high computational efficiency of sequential execution with the reduced off-chip memory overhead of dataflow execution. It switches execution patterns automatically with a static schedule determined before circuit design based on resource constraints and model parameters. Unlike previous reconfigurable accelerators, InTAR encodes reconfiguration schedules during circuit design, allowing model-specific optimizations that allocate only the necessary logic and interconnects. Thus, InTAR achieves a high clock frequency with fewer resources and low reconfiguration time. Furthermore, InTAR supports high-level tools such as HLS for fast design generation. We implement a set of multi-task kernels in various HDV DNNs using InTAR. Compared with dataflow and sequential accelerators, InTAR exhibits $1.8\times$ and $7.1 \times$ speedups correspondingly. We also implement InTAR for GPT-2 medium as a more complex example, which achieves a speedup of $\mathbf{3.65 \sim 39.14\times}$ and a $\mathbf{1.72 \sim 10.44\times}$ boost in DSP efficiency compared to the corresponding SoTA accelerators on FPGAs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08807v1-abstract-full').style.display = 'none'; document.getElementById('2502.08807v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08482">arXiv:2502.08482</a> <span> [<a href="https://arxiv.org/pdf/2502.08482">pdf</a>, <a href="https://arxiv.org/format/2502.08482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Auto-regressive Chain-of-Thought through Loop-Aligned Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qifan Yu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenyu He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sijie Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xun Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingjing Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Di He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08482v1-abstract-short" style="display: inline;"> Chain-of-Thought (CoT) prompting has emerged as a powerful technique for enhancing language model's reasoning capabilities. However, generating long and correct CoT trajectories is challenging. Recent studies have demonstrated that Looped Transformers possess remarkable length generalization capabilities, but their limited generality and adaptability prevent them from serving as an alternative to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08482v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08482v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08482v1-abstract-full" style="display: none;"> Chain-of-Thought (CoT) prompting has emerged as a powerful technique for enhancing language model's reasoning capabilities. However, generating long and correct CoT trajectories is challenging. Recent studies have demonstrated that Looped Transformers possess remarkable length generalization capabilities, but their limited generality and adaptability prevent them from serving as an alternative to auto-regressive solutions. To better leverage the strengths of Looped Transformers, we propose RELAY (REasoning through Loop Alignment iterativelY). Specifically, we align the steps of Chain-of-Thought (CoT) reasoning with loop iterations and apply intermediate supervision during the training of Looped Transformers. This additional iteration-wise supervision not only preserves the Looped Transformer's ability for length generalization but also enables it to predict CoT reasoning steps for unseen data. Therefore, we leverage this Looped Transformer to generate accurate reasoning chains for complex problems that exceed the training length, which will then be used to fine-tune an auto-regressive model. We conduct extensive experiments, and the results demonstrate the effectiveness of our approach, with significant improvements in the performance of the auto-regressive model. Code will be released at https://github.com/qifanyu/RELAY. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08482v1-abstract-full').style.display = 'none'; document.getElementById('2502.08482v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08056">arXiv:2502.08056</a> <span> [<a href="https://arxiv.org/pdf/2502.08056">pdf</a>, <a href="https://arxiv.org/format/2502.08056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Cognify: Supercharging Gen-AI Workflows With Hierarchical Autotuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zijian He</a>, <a href="/search/cs?searchtype=author&query=Abhyankar%2C+R">Reyna Abhyankar</a>, <a href="/search/cs?searchtype=author&query=Srivatsa%2C+V">Vikranth Srivatsa</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiying Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08056v1-abstract-short" style="display: inline;"> Today's gen-AI workflows that involve multiple ML model calls, tool/API calls, data retrieval, or generic code execution are often tuned manually in an ad-hoc way that is both time-consuming and error-prone. In this paper, we propose a systematic approach for automatically tuning gen-AI workflows. Our key insight is that gen-AI workflows can benefit from structure, operator, and prompt changes, bu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08056v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08056v1-abstract-full" style="display: none;"> Today's gen-AI workflows that involve multiple ML model calls, tool/API calls, data retrieval, or generic code execution are often tuned manually in an ad-hoc way that is both time-consuming and error-prone. In this paper, we propose a systematic approach for automatically tuning gen-AI workflows. Our key insight is that gen-AI workflows can benefit from structure, operator, and prompt changes, but unique properties of gen-AI workflows require new optimization techniques. We propose AdaSeek, an adaptive hierarchical search algorithm for autotuning gen-AI workflows. AdaSeek organizes workflow tuning methods into different layers based on the user-specified total search budget and distributes the budget across different layers based on the complexity of each layer. During its hierarchical search, AdaSeek redistributes the search budget from less useful to more promising tuning configurations based on workflow-level evaluation results. We implement AdaSeek in a workflow autotuning framework called Cognify and evaluate Cognify using six types of workflows such as RAG-based QA and text-to-SQL transformation. Overall, Cognify improves these workflows' generation quality by up to 2.8x, reduces execution monetary cost by up to 10x, and reduces end-to-end latency by 2.7x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08056v1-abstract-full').style.display = 'none'; document.getElementById('2502.08056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07802">arXiv:2502.07802</a> <span> [<a href="https://arxiv.org/pdf/2502.07802">pdf</a>, <a href="https://arxiv.org/format/2502.07802">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Movie Weaver: Tuning-Free Multi-Concept Video Personalization with Anchored Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+F">Feng Liang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Haoyu Ma</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zecheng He</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+T">Tingbo Hou</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Ji Hou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kunpeng Li</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiaoliang Dai</a>, <a href="/search/cs?searchtype=author&query=Juefei-Xu%2C+F">Felix Juefei-Xu</a>, <a href="/search/cs?searchtype=author&query=Azadi%2C+S">Samaneh Azadi</a>, <a href="/search/cs?searchtype=author&query=Sinha%2C+A">Animesh Sinha</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peizhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Vajda%2C+P">Peter Vajda</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+D">Diana Marculescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07802v1-abstract-short" style="display: inline;"> Video personalization, which generates customized videos using reference images, has gained significant attention. However, prior methods typically focus on single-concept personalization, limiting broader applications that require multi-concept integration. Attempts to extend these models to multiple concepts often lead to identity blending, which results in composite characters with fused attrib… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07802v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07802v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07802v1-abstract-full" style="display: none;"> Video personalization, which generates customized videos using reference images, has gained significant attention. However, prior methods typically focus on single-concept personalization, limiting broader applications that require multi-concept integration. Attempts to extend these models to multiple concepts often lead to identity blending, which results in composite characters with fused attributes from multiple sources. This challenge arises due to the lack of a mechanism to link each concept with its specific reference image. We address this with anchored prompts, which embed image anchors as unique tokens within text prompts, guiding accurate referencing during generation. Additionally, we introduce concept embeddings to encode the order of reference images. Our approach, Movie Weaver, seamlessly weaves multiple concepts-including face, body, and animal images-into one video, allowing flexible combinations in a single model. The evaluation shows that Movie Weaver outperforms existing methods for multi-concept video personalization in identity preservation and overall quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07802v1-abstract-full').style.display = 'none'; document.getElementById('2502.07802v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://jeff-liangf.github.io/projects/movieweaver/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04417">arXiv:2502.04417</a> <span> [<a href="https://arxiv.org/pdf/2502.04417">pdf</a>, <a href="https://arxiv.org/format/2502.04417">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NeuralMOVES: A lightweight and microscopic vehicle emission estimation model based on reverse engineering and surrogate learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ramirez-Sanchez%2C+E">Edgar Ramirez-Sanchez</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Catherine Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yaosheng Xu</a>, <a href="/search/cs?searchtype=author&query=Renganathan%2C+N">Nrithya Renganathan</a>, <a href="/search/cs?searchtype=author&query=Jayawardana%2C+V">Vindula Jayawardana</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhengbing He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cathy Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04417v1-abstract-short" style="display: inline;"> The transportation sector significantly contributes to greenhouse gas emissions, necessitating accurate emission models to guide mitigation strategies. Despite its field validation and certification, the industry-standard Motor Vehicle Emission Simulator (MOVES) faces challenges related to complexity in usage, high computational demands, and its unsuitability for microscopic real-time applications… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04417v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04417v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04417v1-abstract-full" style="display: none;"> The transportation sector significantly contributes to greenhouse gas emissions, necessitating accurate emission models to guide mitigation strategies. Despite its field validation and certification, the industry-standard Motor Vehicle Emission Simulator (MOVES) faces challenges related to complexity in usage, high computational demands, and its unsuitability for microscopic real-time applications. To address these limitations, we present NeuralMOVES, a comprehensive suite of high-performance, lightweight surrogate models for vehicle CO2 emissions. Developed based on reverse engineering and Neural Networks, NeuralMOVES achieves a remarkable 6.013% Mean Average Percentage Error relative to MOVES across extensive tests spanning over two million scenarios with diverse trajectories and the factors regarding environments and vehicles. NeuralMOVES is only 2.4 MB, largely condensing the original MOVES and the reverse engineered MOVES into a compact representation, while maintaining high accuracy. Therefore, NeuralMOVES significantly enhances accessibility while maintaining the accuracy of MOVES, simplifying CO2 evaluation for transportation analyses and enabling real-time, microscopic applications across diverse scenarios without reliance on complex software or extensive computational resources. Moreover, this paper provides, for the first time, a framework for reverse engineering industrial-grade software tailored specifically to transportation scenarios, going beyond MOVES. The surrogate models are available at https://github.com/edgar-rs/neuralMOVES. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04417v1-abstract-full').style.display = 'none'; document.getElementById('2502.04417v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04277">arXiv:2502.04277</a> <span> [<a href="https://arxiv.org/pdf/2502.04277">pdf</a>, <a href="https://arxiv.org/format/2502.04277">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Non-Variational Quantum Random Access Optimization with Alternating Operator Ansatz </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zichang He</a>, <a href="/search/cs?searchtype=author&query=Raymond%2C+R">Rudy Raymond</a>, <a href="/search/cs?searchtype=author&query=Shaydulin%2C+R">Ruslan Shaydulin</a>, <a href="/search/cs?searchtype=author&query=Pistoia%2C+M">Marco Pistoia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04277v1-abstract-short" style="display: inline;"> Solving hard optimization problems is one of the most promising application domains for quantum computers due to the ubiquity of such problems in industry and the availability of broadly applicable quantum speedups. However, the ability of near-term quantum computers to tackle industrial-scale optimization problems is limited by their size and the overheads of quantum error correction. Quantum Ran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04277v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04277v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04277v1-abstract-full" style="display: none;"> Solving hard optimization problems is one of the most promising application domains for quantum computers due to the ubiquity of such problems in industry and the availability of broadly applicable quantum speedups. However, the ability of near-term quantum computers to tackle industrial-scale optimization problems is limited by their size and the overheads of quantum error correction. Quantum Random Access Optimization (QRAO) has been proposed to reduce the space requirements of quantum optimization. However, to date QRAO has only been implemented using variational algorithms, which suffer from the need to train instance-specific variational parameters, making them difficult to scale. We propose and benchmark a non-variational approach to QRAO based on the Quantum Alternating Operator Ansatz (QAOA) for the MaxCut problem. We show that instance-independent ``fixed'' parameters achieve good performance, removing the need for variational parameter optimization. Additionally, we evaluate different design choices, such as various mixers and initial states, as well as QAOA operator implementations when customizing for QRAO, and identify a strategy that performs well in practice. Our results pave the way for the practical execution of QRAO on early fault-tolerant quantum computers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04277v1-abstract-full').style.display = 'none'; document.getElementById('2502.04277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03835">arXiv:2502.03835</a> <span> [<a href="https://arxiv.org/pdf/2502.03835">pdf</a>, <a href="https://arxiv.org/format/2502.03835">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Single-Domain Generalized Object Detection by Balancing Domain Diversity and Invariance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenwei He</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+H">Hongsu Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03835v1-abstract-short" style="display: inline;"> Single-domain generalization for object detection (S-DGOD) aims to transfer knowledge from a single source domain to unseen target domains. In recent years, many models have focused primarily on achieving feature invariance to enhance robustness. However, due to the inherent diversity across domains, an excessive emphasis on invariance can cause the model to overlook the actual differences between… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03835v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03835v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03835v1-abstract-full" style="display: none;"> Single-domain generalization for object detection (S-DGOD) aims to transfer knowledge from a single source domain to unseen target domains. In recent years, many models have focused primarily on achieving feature invariance to enhance robustness. However, due to the inherent diversity across domains, an excessive emphasis on invariance can cause the model to overlook the actual differences between images. This overemphasis may complicate the training process and lead to a loss of valuable information. To address this issue, we propose the Diversity Invariance Detection Model (DIDM), which focuses on the balance between the diversity of domain-specific and invariance cross domains. Recognizing that domain diversity introduces variations in domain-specific features, we introduce a Diversity Learning Module (DLM). The DLM is designed to preserve the diversity of domain-specific information with proposed feature diversity loss while limiting the category semantics in the features. In addition, to maintain domain invariance, we incorporate a Weighted Aligning Module (WAM), which aligns features without compromising feature diversity. We conducted our model on five distinct datasets, which have illustrated the superior performance and effectiveness of the proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03835v1-abstract-full').style.display = 'none'; document.getElementById('2502.03835v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02459">arXiv:2502.02459</a> <span> [<a href="https://arxiv.org/pdf/2502.02459">pdf</a>, <a href="https://arxiv.org/format/2502.02459">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Computing with Smart Rings: A Systematic Literature Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+R">Ruotong Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jiexin Ding</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiankai Tang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jun Fang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhe He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuojun Li</a>, <a href="/search/cs?searchtype=author&query=R%C3%B6ddiger%2C+T">Tobias R枚ddiger</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weiye Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiyuxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+h">huan-ang Gao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+N">Nan Gao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chun Yu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuanchun Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuntao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02459v1-abstract-short" style="display: inline;"> A smart ring is a wearable electronic device in the form of a ring that incorporates diverse sensors and computing technologies to perform a variety of functions. Designed for use with fingers, smart rings are capable of sensing more subtle and abundant hand movements, thus making them a good platform for interaction. Meanwhile, fingers are abundant with blood vessels and nerve endings and accusto… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02459v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02459v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02459v1-abstract-full" style="display: none;"> A smart ring is a wearable electronic device in the form of a ring that incorporates diverse sensors and computing technologies to perform a variety of functions. Designed for use with fingers, smart rings are capable of sensing more subtle and abundant hand movements, thus making them a good platform for interaction. Meanwhile, fingers are abundant with blood vessels and nerve endings and accustomed to wearing rings, providing an ideal site for continuous health monitoring through smart rings, which combine comfort with the ability to capture vital biometric data, making them suitable for all-day wear. We collected in total of 206 smart ring-related publications and conducted a systematic literature review. We provide a taxonomy regarding the sensing and feedback modalities, applications, and phenomena. We review and categorize these literatures into four main areas: (1) interaction - input, (2) interaction - output, (3) passive sensing - in body feature, (4) passive sensing - out body activity. This comprehensive review highlights the current advancements within the field of smart ring and identifies potential areas for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02459v1-abstract-full').style.display = 'none'; document.getElementById('2502.02459v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01949">arXiv:2502.01949</a> <span> [<a href="https://arxiv.org/pdf/2502.01949">pdf</a>, <a href="https://arxiv.org/format/2502.01949">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> LAYOUTDREAMER: Physics-guided Layout for Text-to-3D Compositional Scene Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zongjin He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qixuan Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01949v1-abstract-short" style="display: inline;"> Recently, the field of text-guided 3D scene generation has garnered significant attention. High-quality generation that aligns with physical realism and high controllability is crucial for practical 3D scene applications. However, existing methods face fundamental limitations: (i) difficulty capturing complex relationships between multiple objects described in the text, (ii) inability to generate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01949v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01949v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01949v1-abstract-full" style="display: none;"> Recently, the field of text-guided 3D scene generation has garnered significant attention. High-quality generation that aligns with physical realism and high controllability is crucial for practical 3D scene applications. However, existing methods face fundamental limitations: (i) difficulty capturing complex relationships between multiple objects described in the text, (ii) inability to generate physically plausible scene layouts, and (iii) lack of controllability and extensibility in compositional scenes. In this paper, we introduce LayoutDreamer, a framework that leverages 3D Gaussian Splatting (3DGS) to facilitate high-quality, physically consistent compositional scene generation guided by text. Specifically, given a text prompt, we convert it into a directed scene graph and adaptively adjust the density and layout of the initial compositional 3D Gaussians. Subsequently, dynamic camera adjustments are made based on the training focal point to ensure entity-level generation quality. Finally, by extracting directed dependencies from the scene graph, we tailor physical and layout energy to ensure both realism and flexibility. Comprehensive experiments demonstrate that LayoutDreamer outperforms other compositional scene generation quality and semantic alignment methods. Specifically, it achieves state-of-the-art (SOTA) performance in the multiple objects generation metric of T3Bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01949v1-abstract-full').style.display = 'none'; document.getElementById('2502.01949v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01004">arXiv:2502.01004</a> <span> [<a href="https://arxiv.org/pdf/2502.01004">pdf</a>, <a href="https://arxiv.org/format/2502.01004">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZeroBP: Learning Position-Aware Correspondence for Zero-shot 6D Pose Estimation in Bin-Picking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianqiu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zikun Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Ye Zheng</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+T">Tianpeng Bao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenyu He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01004v1-abstract-short" style="display: inline;"> Bin-picking is a practical and challenging robotic manipulation task, where accurate 6D pose estimation plays a pivotal role. The workpieces in bin-picking are typically textureless and randomly stacked in a bin, which poses a significant challenge to 6D pose estimation. Existing solutions are typically learning-based methods, which require object-specific training. Their efficiency of practical d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01004v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01004v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01004v1-abstract-full" style="display: none;"> Bin-picking is a practical and challenging robotic manipulation task, where accurate 6D pose estimation plays a pivotal role. The workpieces in bin-picking are typically textureless and randomly stacked in a bin, which poses a significant challenge to 6D pose estimation. Existing solutions are typically learning-based methods, which require object-specific training. Their efficiency of practical deployment for novel workpieces is highly limited by data collection and model retraining. Zero-shot 6D pose estimation is a potential approach to address the issue of deployment efficiency. Nevertheless, existing zero-shot 6D pose estimation methods are designed to leverage feature matching to establish point-to-point correspondences for pose estimation, which is less effective for workpieces with textureless appearances and ambiguous local regions. In this paper, we propose ZeroBP, a zero-shot pose estimation framework designed specifically for the bin-picking task. ZeroBP learns Position-Aware Correspondence (PAC) between the scene instance and its CAD model, leveraging both local features and global positions to resolve the mismatch issue caused by ambiguous regions with similar shapes and appearances. Extensive experiments on the ROBI dataset demonstrate that ZeroBP outperforms state-of-the-art zero-shot pose estimation methods, achieving an improvement of 9.1% in average recall of correct poses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01004v1-abstract-full').style.display = 'none'; document.getElementById('2502.01004v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00708">arXiv:2502.00708</a> <span> [<a href="https://arxiv.org/pdf/2502.00708">pdf</a>, <a href="https://arxiv.org/format/2502.00708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PhiP-G: Physics-Guided Text-to-3D Compositional Scene Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qixuan Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zongjin He</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yan Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00708v1-abstract-short" style="display: inline;"> Text-to-3D asset generation has achieved significant optimization under the supervision of 2D diffusion priors. However, when dealing with compositional scenes, existing methods encounter several challenges: 1). failure to ensure that composite scene layouts comply with physical laws; 2). difficulty in accurately capturing the assets and relationships described in complex scene descriptions; 3). l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00708v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00708v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00708v1-abstract-full" style="display: none;"> Text-to-3D asset generation has achieved significant optimization under the supervision of 2D diffusion priors. However, when dealing with compositional scenes, existing methods encounter several challenges: 1). failure to ensure that composite scene layouts comply with physical laws; 2). difficulty in accurately capturing the assets and relationships described in complex scene descriptions; 3). limited autonomous asset generation capabilities among layout approaches leveraging large language models (LLMs). To avoid these compromises, we propose a novel framework for compositional scene generation, PhiP-G, which seamlessly integrates generation techniques with layout guidance based on a world model. Leveraging LLM-based agents, PhiP-G analyzes the complex scene description to generate a scene graph, and integrating a multimodal 2D generation agent and a 3D Gaussian generation method for targeted assets creation. For the stage of layout, PhiP-G employs a physical pool with adhesion capabilities and a visual supervision agent, forming a world model for layout prediction and planning. Extensive experiments demonstrate that PhiP-G significantly enhances the generation quality and physical rationality of the compositional scenes. Notably, PhiP-G attains state-of-the-art (SOTA) performance in CLIP scores, achieves parity with the leading methods in generation quality as measured by the T$^3$Bench, and improves efficiency by 24x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00708v1-abstract-full').style.display = 'none'; document.getElementById('2502.00708v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages.8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00592">arXiv:2502.00592</a> <span> [<a href="https://arxiv.org/pdf/2502.00592">pdf</a>, <a href="https://arxiv.org/format/2502.00592">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> M+: Extending MemoryLLM with Scalable Long-Term Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Krotov%2C+D">Dmitry Krotov</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuanzhe Hu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifan Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wangchunshu Zhou</a>, <a href="/search/cs?searchtype=author&query=McAuley%2C+J">Julian McAuley</a>, <a href="/search/cs?searchtype=author&query=Gutfreund%2C+D">Dan Gutfreund</a>, <a href="/search/cs?searchtype=author&query=Feris%2C+R">Rogerio Feris</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zexue He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00592v1-abstract-short" style="display: inline;"> Equipping large language models (LLMs) with latent-space memory has attracted increasing attention as they can extend the context window of existing language models. However, retaining information from the distant past remains a challenge. For example, MemoryLLM (Wang et al., 2024a), as a representative work with latent-space memory, compresses past information into hidden states across all layers… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00592v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00592v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00592v1-abstract-full" style="display: none;"> Equipping large language models (LLMs) with latent-space memory has attracted increasing attention as they can extend the context window of existing language models. However, retaining information from the distant past remains a challenge. For example, MemoryLLM (Wang et al., 2024a), as a representative work with latent-space memory, compresses past information into hidden states across all layers, forming a memory pool of 1B parameters. While effective for sequence lengths up to 16k tokens, it struggles to retain knowledge beyond 20k tokens. In this work, we address this limitation by introducing M+, a memory-augmented model based on MemoryLLM that significantly enhances long-term information retention. M+ integrates a long-term memory mechanism with a co-trained retriever, dynamically retrieving relevant information during text generation. We evaluate M+ on diverse benchmarks, including long-context understanding and knowledge retention tasks. Experimental results show that M+ significantly outperforms MemoryLLM and recent strong baselines, extending knowledge retention from under 20k to over 160k tokens with similar GPU memory overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00592v1-abstract-full').style.display = 'none'; document.getElementById('2502.00592v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00510">arXiv:2502.00510</a> <span> [<a href="https://arxiv.org/pdf/2502.00510">pdf</a>, <a href="https://arxiv.org/format/2502.00510">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Who's the MVP? A Game-Theoretic Evaluation Benchmark for Modular Attribution in LLM Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yingxuan Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Bo Huang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+S">Siyuan Qi</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chao Feng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Haoyi Hu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuxuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinbo Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haoran Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziyi He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zongyu Wang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+L">Lin Qiu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuezhi Cao</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xunliang Cai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00510v2-abstract-short" style="display: inline;"> Large Language Model (LLM) agents frameworks often employ modular architectures, incorporating components such as planning, reasoning, action execution, and reflection to tackle complex tasks. However, quantifying the contribution of each module to overall system performance remains a significant challenge, impeding optimization and interpretability. To address this, we introduce CapaBench (Capabi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00510v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00510v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00510v2-abstract-full" style="display: none;"> Large Language Model (LLM) agents frameworks often employ modular architectures, incorporating components such as planning, reasoning, action execution, and reflection to tackle complex tasks. However, quantifying the contribution of each module to overall system performance remains a significant challenge, impeding optimization and interpretability. To address this, we introduce CapaBench (Capability-level Assessment Benchmark), an evaluation framework grounded in cooperative game theory's Shapley Value, which systematically measures the marginal impact of individual modules and their interactions within an agent's architecture. By replacing default modules with test variants across all possible combinations, CapaBench provides a principle method for attributing performance contributions. Key contributions include: (1) We are the first to propose a Shapley Value-based methodology for quantifying the contributions of capabilities in LLM agents; (2) Modules with high Shapley Values consistently lead to predictable performance gains when combined, enabling targeted optimization; and (3) We build a multi-round dataset of over 1,500 entries spanning diverse domains and practical task scenarios, enabling comprehensive evaluation of agent capabilities. CapaBench bridges the gap between component-level evaluation and holistic system assessment, providing actionable insights for optimizing modular LLM agents and advancing their deployment in complex, real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00510v2-abstract-full').style.display = 'none'; document.getElementById('2502.00510v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18585">arXiv:2501.18585</a> <span> [<a href="https://arxiv.org/pdf/2501.18585">pdf</a>, <a href="https://arxiv.org/format/2501.18585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Thoughts Are All Over the Place: On the Underthinking of o1-Like LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiuzhi Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiahao Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+T">Tian Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhiwei He</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linfeng Song</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dian Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juntao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuosheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhaopeng Tu</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+H">Haitao Mi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18585v2-abstract-short" style="display: inline;"> Large language models (LLMs) such as OpenAI's o1 have demonstrated remarkable abilities in complex reasoning tasks by scaling test-time compute and exhibiting human-like deep thinking. However, we identify a phenomenon we term underthinking, where o1-like LLMs frequently switch between different reasoning thoughts without sufficiently exploring promising paths to reach a correct solution. This beh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18585v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18585v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18585v2-abstract-full" style="display: none;"> Large language models (LLMs) such as OpenAI's o1 have demonstrated remarkable abilities in complex reasoning tasks by scaling test-time compute and exhibiting human-like deep thinking. However, we identify a phenomenon we term underthinking, where o1-like LLMs frequently switch between different reasoning thoughts without sufficiently exploring promising paths to reach a correct solution. This behavior leads to inadequate depth of reasoning and decreased performance, particularly on challenging mathematical problems. To systematically analyze this issue, we conduct experiments on three challenging test sets and two representative open-source o1-like models, revealing that frequent thought switching correlates with incorrect responses. We introduce a novel metric to quantify underthinking by measuring token efficiency in incorrect answers. To address underthinking, we propose a decoding strategy with thought switching penalty TIP that discourages premature transitions between thoughts, encouraging deeper exploration of each reasoning path. Experimental results demonstrate that our approach improves accuracy across challenging datasets without requiring model fine-tuning. Our findings contribute to understanding reasoning inefficiencies in o1-like LLMs and offer a practical solution to enhance their problem-solving capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18585v2-abstract-full').style.display = 'none'; document.getElementById('2501.18585v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">1. We have updated the results for DeepSeek-R1, and all of our original conclusions remain valid. 2. Our proposed Tip approach remains effective in Best-of-N scenarios (e.g., self-consistency and Laconic Decoding) when built on DeepSeek-R1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15910">arXiv:2501.15910</a> <span> [<a href="https://arxiv.org/pdf/2501.15910">pdf</a>, <a href="https://arxiv.org/ps/2501.15910">ps</a>, <a href="https://arxiv.org/format/2501.15910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> The Sample Complexity of Online Reinforcement Learning: A Multi-model Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Muehlebach%2C+M">Michael Muehlebach</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhiyu He</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+M+I">Michael I. Jordan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15910v1-abstract-short" style="display: inline;"> We study the sample complexity of online reinforcement learning for nonlinear dynamical systems with continuous state and action spaces. Our analysis accommodates a large class of dynamical systems ranging from a finite set of nonlinear candidate models to models with bounded and Lipschitz continuous dynamics, to systems that are parametrized by a compact and real-valued set of parameters. In the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15910v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15910v1-abstract-full" style="display: none;"> We study the sample complexity of online reinforcement learning for nonlinear dynamical systems with continuous state and action spaces. Our analysis accommodates a large class of dynamical systems ranging from a finite set of nonlinear candidate models to models with bounded and Lipschitz continuous dynamics, to systems that are parametrized by a compact and real-valued set of parameters. In the most general setting, our algorithm achieves a policy regret of $\mathcal{O}(N 蔚^2 + \mathrm{ln}(m(蔚))/蔚^2)$, where $N$ is the time horizon, $蔚$ is a user-specified discretization width, and $m(蔚)$ measures the complexity of the function class under consideration via its packing number. In the special case where the dynamics are parametrized by a compact and real-valued set of parameters (such as neural networks, transformers, etc.), we prove a policy regret of $\mathcal{O}(\sqrt{N p})$, where $p$ denotes the number of parameters, recovering earlier sample-complexity results that were derived for linear time-invariant dynamical systems. While this article focuses on characterizing sample complexity, the proposed algorithms are likely to be useful in practice, due to their simplicity, the ability to incorporate prior knowledge, and their benign transient behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15910v1-abstract-full').style.display = 'none'; document.getElementById('2501.15910v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15253">arXiv:2501.15253</a> <span> [<a href="https://arxiv.org/pdf/2501.15253">pdf</a>, <a href="https://arxiv.org/format/2501.15253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Deepfake Detection via Effective Local-Global Feature Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jiazhen Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziqiang Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziwen He</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zhangjie Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15253v1-abstract-short" style="display: inline;"> The rapid advancement of GANs and diffusion models has led to the generation of increasingly realistic fake images, posing significant hidden dangers and threats to society. Consequently, deepfake detection has become a pressing issue in today's world. While some existing methods focus on forgery features from either a local or global perspective, they often overlook the complementary nature of th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15253v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15253v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15253v1-abstract-full" style="display: none;"> The rapid advancement of GANs and diffusion models has led to the generation of increasingly realistic fake images, posing significant hidden dangers and threats to society. Consequently, deepfake detection has become a pressing issue in today's world. While some existing methods focus on forgery features from either a local or global perspective, they often overlook the complementary nature of these features. Other approaches attempt to incorporate both local and global features but rely on simplistic strategies, such as cropping, which fail to capture the intricate relationships between local features. To address these limitations, we propose a novel method that effectively combines local spatial-frequency domain features with global frequency domain information, capturing detailed and holistic forgery traces. Specifically, our method uses Discrete Wavelet Transform (DWT) and sliding windows to tile forged features and leverages attention mechanisms to extract local spatial-frequency domain information. Simultaneously, the phase component of the Fast Fourier Transform (FFT) is integrated with attention mechanisms to extract global frequency domain information, complementing the local features and ensuring the integrity of forgery detection. Comprehensive evaluations on open-world datasets generated by 34 distinct generative models demonstrate a significant improvement of 2.9% over existing state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15253v1-abstract-full').style.display = 'none'; document.getElementById('2501.15253v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15085">arXiv:2501.15085</a> <span> [<a href="https://arxiv.org/pdf/2501.15085">pdf</a>, <a href="https://arxiv.org/format/2501.15085">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Data Center Cooling System Optimization Using Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhan%2C+X">Xianyuan Zhan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiangyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+P">Peng Cheng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiao Hu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziteng He</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+H">Hanfei Geng</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+J">Jichao Leng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Huiwen Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenhui Liu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+T">Tianshun Hong</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yan Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunxin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Feng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15085v2-abstract-short" style="display: inline;"> The recent advances in information technology and artificial intelligence have fueled a rapid expansion of the data center (DC) industry worldwide, accompanied by an immense appetite for electricity to power the DCs. In a typical DC, around 30~40% of the energy is spent on the cooling system rather than on computer servers, posing a pressing need for developing new energy-saving optimization techn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15085v2-abstract-full').style.display = 'inline'; document.getElementById('2501.15085v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15085v2-abstract-full" style="display: none;"> The recent advances in information technology and artificial intelligence have fueled a rapid expansion of the data center (DC) industry worldwide, accompanied by an immense appetite for electricity to power the DCs. In a typical DC, around 30~40% of the energy is spent on the cooling system rather than on computer servers, posing a pressing need for developing new energy-saving optimization technologies for DC cooling systems. However, optimizing such real-world industrial systems faces numerous challenges, including but not limited to a lack of reliable simulation environments, limited historical data, and stringent safety and control robustness requirements. In this work, we present a novel physics-informed offline reinforcement learning (RL) framework for energy efficiency optimization of DC cooling systems. The proposed framework models the complex dynamical patterns and physical dependencies inside a server room using a purposely designed graph neural network architecture that is compliant with the fundamental time-reversal symmetry. Because of its well-behaved and generalizable state-action representations, the model enables sample-efficient and robust latent space offline policy learning using limited real-world operational data. Our framework has been successfully deployed and verified in a large-scale production DC for closed-loop control of its air-cooling units (ACUs). We conducted a total of 2000 hours of short and long-term experiments in the production DC environment. The results show that our method achieves 14~21% energy savings in the DC cooling system, without any violation of the safety or operational constraints. Our results have demonstrated the significant potential of offline RL in solving a broad range of data-limited, safety-critical real-world industrial control problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15085v2-abstract-full').style.display = 'none'; document.getElementById('2501.15085v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12202">arXiv:2501.12202</a> <span> [<a href="https://arxiv.org/pdf/2501.12202">pdf</a>, <a href="https://arxiv.org/format/2501.12202">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D Assets Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+Z">Zeqiang Lai</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qingxiang Lin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yunfei Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haolin Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuhui Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yifei Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mingxin Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xianghui Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huiwen Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sicong Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junta Wu</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+Y">Yihang Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruining Tang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zebin He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinzhou Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+X">Xuhui Zuo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+B">Biwen Lei</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+H">Haohan Weng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiling Zhu</a> , et al. (49 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12202v3-abstract-short" style="display: inline;"> We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets. This system includes two foundation components: a large-scale shape generation model -- Hunyuan3D-DiT, and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12202v3-abstract-full').style.display = 'inline'; document.getElementById('2501.12202v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12202v3-abstract-full" style="display: none;"> We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets. This system includes two foundation components: a large-scale shape generation model -- Hunyuan3D-DiT, and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that properly aligns with a given condition image, laying a solid foundation for downstream applications. The texture synthesis model, benefiting from strong geometric and diffusion priors, produces high-resolution and vibrant texture maps for either generated or hand-crafted meshes. Furthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production platform that simplifies the re-creation process of 3D assets. It allows both professional and amateur users to manipulate or even animate their meshes efficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0 outperforms previous state-of-the-art models, including the open-source models and closed-source models in geometry details, condition alignment, texture quality, and etc. Hunyuan3D 2.0 is publicly released in order to fill the gaps in the open-source 3D community for large-scale foundation generative models. The code and pre-trained weights of our models are available at: https://github.com/Tencent/Hunyuan3D-2 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12202v3-abstract-full').style.display = 'none'; document.getElementById('2501.12202v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">GitHub link: https://github.com/Tencent/Hunyuan3D-2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12175">arXiv:2501.12175</a> <span> [<a href="https://arxiv.org/pdf/2501.12175">pdf</a>, <a href="https://arxiv.org/format/2501.12175">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Less is More: Information Bottleneck Denoised Multimedia Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yonghui Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Le Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhuangzhuang He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhengwei Wu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+R">Richang Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12175v1-abstract-short" style="display: inline;"> Empowered by semantic-rich content information, multimedia recommendation has emerged as a potent personalized technique. Current endeavors center around harnessing multimedia content to refine item representation or uncovering latent item-item structures based on modality similarity. Despite the effectiveness, we posit that these methods are usually suboptimal due to the introduction of irrelevan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12175v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12175v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12175v1-abstract-full" style="display: none;"> Empowered by semantic-rich content information, multimedia recommendation has emerged as a potent personalized technique. Current endeavors center around harnessing multimedia content to refine item representation or uncovering latent item-item structures based on modality similarity. Despite the effectiveness, we posit that these methods are usually suboptimal due to the introduction of irrelevant multimedia features into recommendation tasks. This stems from the fact that generic multimedia feature extractors, while well-designed for domain-specific tasks, can inadvertently introduce task-irrelevant features, leading to potential misguidance of recommenders. In this work, we propose a denoised multimedia recommendation paradigm via the Information Bottleneck principle (IB). Specifically, we propose a novel Information Bottleneck denoised Multimedia Recommendation (IBMRec) model to tackle the irrelevant feature issue. IBMRec removes task-irrelevant features from both feature and item-item structure perspectives, which are implemented by two-level IB learning modules: feature-level (FIB) and graph-level (GIB). In particular, FIB focuses on learning the minimal yet sufficient multimedia features. This is achieved by maximizing the mutual information between multimedia representation and recommendation tasks, while concurrently minimizing it between multimedia representation and pre-trained multimedia features. Furthermore, GIB is designed to learn the robust item-item graph structure, it refines the item-item graph based on preference affinity, then minimizes the mutual information between the original graph and the refined one. Extensive experiments across three benchmarks validate the effectiveness of our proposed model, showcasing high performance, and applicability to various multimedia recommenders. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12175v1-abstract-full').style.display = 'none'; document.getElementById('2501.12175v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11508">arXiv:2501.11508</a> <span> [<a href="https://arxiv.org/pdf/2501.11508">pdf</a>, <a href="https://arxiv.org/format/2501.11508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> See In Detail: Enhancing Sparse-view 3D Gaussian Splatting with Local Depth and Semantic Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Zongqi He</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhe Xiao</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+K">Kin-Chung Chan</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+Y">Yushen Zuo</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kin-Man Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11508v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has shown remarkable performance in novel view synthesis. However, its rendering quality deteriorates with sparse inphut views, leading to distorted content and reduced details. This limitation hinders its practical application. To address this issue, we propose a sparse-view 3DGS method. Given the inherently ill-posed nature of sparse-view rendering, incorporating pri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11508v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11508v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has shown remarkable performance in novel view synthesis. However, its rendering quality deteriorates with sparse inphut views, leading to distorted content and reduced details. This limitation hinders its practical application. To address this issue, we propose a sparse-view 3DGS method. Given the inherently ill-posed nature of sparse-view rendering, incorporating prior information is crucial. We propose a semantic regularization technique, using features extracted from the pretrained DINO-ViT model, to ensure multi-view semantic consistency. Additionally, we propose local depth regularization, which constrains depth values to improve generalization on unseen views. Our method outperforms state-of-the-art novel view synthesis approaches, achieving up to 0.4dB improvement in terms of PSNR on the LLFF dataset, with reduced distortion and enhanced visual quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11508v1-abstract-full').style.display = 'none'; document.getElementById('2501.11508v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, has been accepted by the ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10787">arXiv:2501.10787</a> <span> [<a href="https://arxiv.org/pdf/2501.10787">pdf</a>, <a href="https://arxiv.org/format/2501.10787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LD-DETR: Loop Decoder DEtection TRansformer for Video Moment Retrieval and Highlight Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Pengcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhixian He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fuwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shujin Lin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Fan Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10787v1-abstract-short" style="display: inline;"> Video Moment Retrieval and Highlight Detection aim to find corresponding content in the video based on a text query. Existing models usually first use contrastive learning methods to align video and text features, then fuse and extract multimodal information, and finally use a Transformer Decoder to decode multimodal information. However, existing methods face several issues: (1) Overlapping seman… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10787v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10787v1-abstract-full" style="display: none;"> Video Moment Retrieval and Highlight Detection aim to find corresponding content in the video based on a text query. Existing models usually first use contrastive learning methods to align video and text features, then fuse and extract multimodal information, and finally use a Transformer Decoder to decode multimodal information. However, existing methods face several issues: (1) Overlapping semantic information between different samples in the dataset hinders the model's multimodal aligning performance; (2) Existing models are not able to efficiently extract local features of the video; (3) The Transformer Decoder used by the existing model cannot adequately decode multimodal features. To address the above issues, we proposed the LD-DETR model for Video Moment Retrieval and Highlight Detection tasks. Specifically, we first distilled the similarity matrix into the identity matrix to mitigate the impact of overlapping semantic information. Then, we designed a method that enables convolutional layers to extract multimodal local features more efficiently. Finally, we fed the output of the Transformer Decoder back into itself to adequately decode multimodal information. We evaluated LD-DETR on four public benchmarks and conducted extensive experiments to demonstrate the superiority and effectiveness of our approach. Our model outperforms the State-Of-The-Art models on QVHighlight, Charades-STA and TACoS datasets. Our code is available at https://github.com/qingchen239/ld-detr. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10787v1-abstract-full').style.display = 'none'; document.getElementById('2501.10787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07892">arXiv:2501.07892</a> <span> [<a href="https://arxiv.org/pdf/2501.07892">pdf</a>, <a href="https://arxiv.org/format/2501.07892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Metamemory Mechanisms for Enhanced Data-Free Code Generation in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Liang Ding</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yibing Zhan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yong Luo</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zheng He</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dapeng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07892v1-abstract-short" style="display: inline;"> Automated code generation using large language models (LLMs) has gained attention due to its efficiency and adaptability. However, real-world coding tasks or benchmarks like HumanEval and StudentEval often lack dedicated training datasets, challenging existing few-shot prompting approaches that rely on reference examples. Inspired by human metamemory-a cognitive process involving recall and evalua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07892v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07892v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07892v1-abstract-full" style="display: none;"> Automated code generation using large language models (LLMs) has gained attention due to its efficiency and adaptability. However, real-world coding tasks or benchmarks like HumanEval and StudentEval often lack dedicated training datasets, challenging existing few-shot prompting approaches that rely on reference examples. Inspired by human metamemory-a cognitive process involving recall and evaluation-we present a novel framework (namely M^2WF) for improving LLMs' one-time code generation. This approach enables LLMs to autonomously generate, evaluate, and utilize synthetic examples to enhance reliability and performance. Unlike prior methods, it minimizes dependency on curated data and adapts flexibly to various coding scenarios. Our experiments demonstrate significant improvements in coding benchmarks, offering a scalable and robust solution for data-free environments. The code and framework will be publicly available on GitHub and HuggingFace. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07892v1-abstract-full').style.display = 'none'; document.getElementById('2501.07892v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages,6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06921">arXiv:2501.06921</a> <span> [<a href="https://arxiv.org/pdf/2501.06921">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Monolithic 3D FPGAs Utilizing Back-End-of-Line Configuration Memories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Waqar%2C+F">Faaiq Waqar</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiahao Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+A">Anni Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zifan He</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+J">Jason Cong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shimeng Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06921v1-abstract-short" style="display: inline;"> This work presents a novel monolithic 3D (M3D) FPGA architecture that leverages stackable back-end-of-line (BEOL) transistors to implement configuration memory and pass gates, significantly improving area, latency, and power efficiency. By integrating n-type (W-doped In_2O_3) and p-type (SnO) amorphous oxide semiconductor (AOS) transistors in the BEOL, Si SRAM configuration bits are substituted wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06921v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06921v1-abstract-full" style="display: none;"> This work presents a novel monolithic 3D (M3D) FPGA architecture that leverages stackable back-end-of-line (BEOL) transistors to implement configuration memory and pass gates, significantly improving area, latency, and power efficiency. By integrating n-type (W-doped In_2O_3) and p-type (SnO) amorphous oxide semiconductor (AOS) transistors in the BEOL, Si SRAM configuration bits are substituted with a less leaky equivalent that can be programmed at logic-compatible voltages. BEOL-compatible AOS transistors are currently under extensive research and development in the device community, with investment by leading foundries, from which reported data is used to develop robust physics-based models in TCAD that enable circuit design. The use of AOS pass gates reduces the overhead of reconfigurable circuits by mapping FPGA switch block (SB) and connection block (CB) matrices above configurable logic blocks (CLBs), thereby increasing the proximity of logic elements and reducing latency. By interfacing with the latest Verilog-to-Routing (VTR) suite, an AOS-based M3D FPGA design implemented in 7 nm technology is demonstrated with 3.4x lower area-time squared product (AT^2), 27% lower critical path latency, and 26% lower reconfigurable routing block power on benchmarks including hyperdimensional computing and large language models (LLMs). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06921v1-abstract-full').style.display = 'none'; document.getElementById('2501.06921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 Pages, 9 Figures, 3 Tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> B.3.1; B.7.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05901">arXiv:2501.05901</a> <span> [<a href="https://arxiv.org/pdf/2501.05901">pdf</a>, <a href="https://arxiv.org/format/2501.05901">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Valley2: Exploring Multimodal Models with Scalable Vision-Language Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Ziheng Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenghao Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruipu Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Can Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhentao He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xian Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haoran Lin</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Minghui Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05901v2-abstract-short" style="display: inline;"> Recently, vision-language models have made remarkable progress, demonstrating outstanding capabilities in various tasks such as image captioning and video understanding. We introduce Valley2, a novel multimodal large language model designed to enhance performance across all domains and extend the boundaries of practical applications in e-commerce and short video scenarios. Notably, Valley2 achieve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05901v2-abstract-full').style.display = 'inline'; document.getElementById('2501.05901v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05901v2-abstract-full" style="display: none;"> Recently, vision-language models have made remarkable progress, demonstrating outstanding capabilities in various tasks such as image captioning and video understanding. We introduce Valley2, a novel multimodal large language model designed to enhance performance across all domains and extend the boundaries of practical applications in e-commerce and short video scenarios. Notably, Valley2 achieves state-of-the-art (SOTA) performance on e-commerce benchmarks, surpassing open-source models of similar size by a large margin (79.66 vs. 72.76). Additionally, Valley2 ranks second on the OpenCompass leaderboard among models with fewer than 10B parameters, with an impressive average score of 67.4. The code and model weights are open-sourced at https://github.com/bytedance/Valley. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05901v2-abstract-full').style.display = 'none'; document.getElementById('2501.05901v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05006">arXiv:2501.05006</a> <span> [<a href="https://arxiv.org/pdf/2501.05006">pdf</a>, <a href="https://arxiv.org/format/2501.05006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> CHASE: A Native Relational Database for Hybrid Queries on Structured and Unstructured Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+R">Rui Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenying He</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+Y">Yinan Jing</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+S">X. Sean Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenqiang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05006v1-abstract-short" style="display: inline;"> Querying both structured and unstructured data has become a new paradigm in data analytics and recommendation. With unstructured data, such as text and videos, are converted to high-dimensional vectors and queried with approximate nearest neighbor search (ANNS). State-of-the-art database systems implement vector search as a plugin in the relational query engine, which tries to utilize the ANN inde… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05006v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05006v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05006v1-abstract-full" style="display: none;"> Querying both structured and unstructured data has become a new paradigm in data analytics and recommendation. With unstructured data, such as text and videos, are converted to high-dimensional vectors and queried with approximate nearest neighbor search (ANNS). State-of-the-art database systems implement vector search as a plugin in the relational query engine, which tries to utilize the ANN index to enhance performance. After investigating a broad range of hybrid queries, we find that such designs may miss potential optimization opportunities and achieve suboptimal performance for certain queries. In this paper, we propose CHASE, a query engine that is natively designed to support efficient hybrid queries on structured and unstructured data. CHASE performs specific designs and optimizations on multiple stages in query processing. First, semantic analysis is performed to categorize queries and optimize query plans dynamically. Second, new physical operators are implemented to avoid redundant computations, which is the case with existing operators. Third, compilation-based techniques are adopted for efficient machine code generation. Extensive evaluations using real-world datasets demonstrate that CHASE achieves substantial performance improvements, with speedups ranging from 13% to an extraordinary 7500 times compared to existing systems. These results highlight CHASE's potential as a robust solution for executing hybrid queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05006v1-abstract-full').style.display = 'none'; document.getElementById('2501.05006v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04987">arXiv:2501.04987</a> <span> [<a href="https://arxiv.org/pdf/2501.04987">pdf</a>, <a href="https://arxiv.org/format/2501.04987">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TreeKV: Smooth Key-Value Cache Compression with Tree Structures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziwei He</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jian Yuan</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+H">Haoli Bai</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+J">Jingwen Leng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04987v2-abstract-short" style="display: inline;"> Efficient key-value (KV) cache compression is critical for scaling transformer-based Large Language Models (LLMs) in long sequences and resource-limited settings. Existing methods evict tokens based on their positions or importance scores, but position-based strategies can miss crucial information outside predefined regions, while those relying on global importance scores resulting in strong regio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04987v2-abstract-full').style.display = 'inline'; document.getElementById('2501.04987v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04987v2-abstract-full" style="display: none;"> Efficient key-value (KV) cache compression is critical for scaling transformer-based Large Language Models (LLMs) in long sequences and resource-limited settings. Existing methods evict tokens based on their positions or importance scores, but position-based strategies can miss crucial information outside predefined regions, while those relying on global importance scores resulting in strong regional biases, limiting the KV cache's overall context retention and potentially impairing the performance of LLMs on complex tasks. Our wavelet analysis reveals that as tokens approach the end of sequence, their contributions to generation gradually increase and tends to diverge more from neighboring tokens, indicating a smooth transition with increasing complexity and variability from distant to nearby context. Motivated by this observation, we propose TreeKV, an intuitive, training-free method that employs a tree structure for smooth cache compression. TreeKV maintains a fixed cache size, allowing LLMs to deliver high-quality output even in long text scenarios. Unlike most compression methods, TreeKV is applicable to both the generation and prefilling stages. TreeKV consistently surpasses all baseline models in language modeling tasks on PG19 and OpenWebText2, allowing LLMs trained with short context window to generalize to longer window with a 16x cache reduction. On the Longbench benchmark, TreeKV achieves the best performance with only 6\% of the budget at optimal efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04987v2-abstract-full').style.display = 'none'; document.getElementById('2501.04987v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04908">arXiv:2501.04908</a> <span> [<a href="https://arxiv.org/pdf/2501.04908">pdf</a>, <a href="https://arxiv.org/format/2501.04908">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> HaVen: Hallucination-Mitigated LLM for Verilog Code Generation Aligned with HDL Engineers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yiyao Yang</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+F">Fu Teng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Pengju Liu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+M">Mengnan Qi</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+C">Chenyang Lv</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Ji Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhezhi He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04908v1-abstract-short" style="display: inline;"> Recently, the use of large language models (LLMs) for Verilog code generation has attracted great research interest to enable hardware design automation. However, previous works have shown a gap between the ability of LLMs and the practical demands of hardware description language (HDL) engineering. This gap includes differences in how engineers phrase questions and hallucinations in the code gene… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04908v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04908v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04908v1-abstract-full" style="display: none;"> Recently, the use of large language models (LLMs) for Verilog code generation has attracted great research interest to enable hardware design automation. However, previous works have shown a gap between the ability of LLMs and the practical demands of hardware description language (HDL) engineering. This gap includes differences in how engineers phrase questions and hallucinations in the code generated. To address these challenges, we introduce HaVen, a novel LLM framework designed to mitigate hallucinations and align Verilog code generation with the practices of HDL engineers. HaVen tackles hallucination issues by proposing a comprehensive taxonomy and employing a chain-of-thought (CoT) mechanism to translate symbolic modalities (e.g. truth tables, state diagrams, etc.) into accurate natural language descriptions. Furthermore, HaVen bridges this gap by using a data augmentation strategy. It synthesizes high-quality instruction-code pairs that match real HDL engineering practices. Our experiments demonstrate that HaVen significantly improves the correctness of Verilog code generation, outperforming state-of-the-art LLM-based Verilog generation methods on VerilogEval and RTLLM benchmark. HaVen is publicly available at https://github.com/Intelligent-Computing-Research-Group/HaVen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04908v1-abstract-full').style.display = 'none'; document.getElementById('2501.04908v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=He%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository