<!-- Archived capture (cinxe.com mirror): "Search | arXiv e-print repository".
     Stray pre-doctype text converted to this comment: bare text appearing before
     <!DOCTYPE html> makes browsers ignore the doctype and fall back to quirks
     mode, while a comment here is legal and preserves standards-mode parsing. -->
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 550 results for author: <span class="mathjax">Yan, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/" aria-role="search"> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yan, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label 
class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yan%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yan, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option 
value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li 
class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09707">arXiv:2502.09707</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09707">pdf</a>, <a href="https://arxiv.org/format/2502.09707">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> </div> </div> <p class="title is-5 mathjax"> SDSS-IV MaStar: Quantification and Abatement of Interstellar Absorption in the Largest Empirical Stellar Spectral Library </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Rubin%2C+K+H+R">Kate H. R. Rubin</a>, <a href="/search/?searchtype=author&amp;query=Westfall%2C+K+B">Kyle B. Westfall</a>, <a href="/search/?searchtype=author&amp;query=Maraston%2C+C">Claudia Maraston</a>, <a href="/search/?searchtype=author&amp;query=Thomas%2C+D">Daniel Thomas</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Renbin Yan</a>, <a href="/search/?searchtype=author&amp;query=Howk%2C+J+C">J. Christopher Howk</a>, <a href="/search/?searchtype=author&amp;query=Aguirre%2C+E">Erick Aguirre</a>, <a href="/search/?searchtype=author&amp;query=Parker%2C+K+S">Kaelee S. Parker</a>, <a href="/search/?searchtype=author&amp;query=Law%2C+D+R">David R. Law</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09707v1-abstract-short" style="display: inline;"> We assess the impact of CaII 3934,3969 and NaI 5891,5897 absorption arising in the interstellar medium (ISM) on the SDSS-IV MaNGA Stellar Library (MaStar) and produce corrected spectroscopy for 80% of the 24,162-star catalog. 
We model the absorption strength of these transitions as a function of stellar distance, Galactic latitude, and dust reddening based upon high-spectral resolution studies. Wi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09707v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09707v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09707v1-abstract-full" style="display: none;"> We assess the impact of CaII 3934,3969 and NaI 5891,5897 absorption arising in the interstellar medium (ISM) on the SDSS-IV MaNGA Stellar Library (MaStar) and produce corrected spectroscopy for 80% of the 24,162-star catalog. We model the absorption strength of these transitions as a function of stellar distance, Galactic latitude, and dust reddening based upon high-spectral resolution studies. With this model, we identify 6342 MaStar stars that have negligible ISM absorption ($W^\mathrm{ISM}$(CaII K) $&lt;0.07$ Ang and $W^\mathrm{ISM}$(NaI 5891) $&lt;0.05$ Ang). For 12,110 of the remaining stars, we replace their NaI D profile (and their CaII profile for effective temperatures $T_{\rm eff}&gt;9000$ K) with a coadded spectrum of low-ISM stars with similar $T_{\rm eff}$, surface gravity, and metallicity. For 738 additional stars with $T_{\rm eff}&gt;9000$ K, we replace these spectral regions with a matching ATLAS9-based BOSZ model. This results in a mean reduction in $W$(CaII K) ($W$(NaI D)) of $0.4-0.7$ Ang ($0.6-1.1$ Ang) for hot stars ($T_{\rm eff}&gt;7610$ K), and a mean reduction in $W$(NaI D) of $0.1-0.2$ Ang for cooler stars. 
We show that interstellar absorption in simple stellar population (SSP) model spectra constructed from the original library artificially enhances $W$(CaII K) by $\gtrsim20\%$ at young ages ($&lt;400$ Myr); dramatically enhances the strength of stellar NaI D in starbursting systems (by ${\gtrsim}50\%$); and enhances stellar NaI D in older stellar populations (${\gtrsim}10$ Gyr) by ${\gtrsim}10\%$. We provide SSP spectra constructed from the cleaned library, and discuss the implications of these effects for stellar population synthesis analyses constraining stellar age, [Na/Fe] abundance, and the initial mass function. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09707v1-abstract-full').style.display = 'none'; document.getElementById('2502.09707v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">45 pages, 25 figures, 2 appendices. Accepted to ApJ. Cleaned MaStar stellar library spectra are available at https://doi.org/10.5281/zenodo.14014915 . SSP spectra constructed from the cleaned library are available at https://doi.org/10.5281/zenodo.14807331 . 
A subset are available for use with the MaNGA DAP at https://github.com/sdss/mangadap/tree/4.2.0/mangadap/data/spectral_templates</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07903">arXiv:2502.07903</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07903">pdf</a>, <a href="https://arxiv.org/format/2502.07903">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> HexGen-2: Disaggregated Generative Inference of LLMs in Heterogeneous Environment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Jiang%2C+Y">Youhe Jiang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ran Yan</a>, <a href="/search/?searchtype=author&amp;query=Yuan%2C+B">Binhang Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07903v1-abstract-short" style="display: inline;"> Disaggregating the prefill and decoding phases represents an effective new paradigm for generative inference of large language models (LLM), which eliminates prefill-decoding interference and optimizes resource allocation. 
However, it is still an open problem about how to deploy the disaggregated inference paradigm across a group of heterogeneous GPUs, which can be an economical alternative to dep&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07903v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07903v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07903v1-abstract-full" style="display: none;"> Disaggregating the prefill and decoding phases represents an effective new paradigm for generative inference of large language models (LLM), which eliminates prefill-decoding interference and optimizes resource allocation. However, it is still an open problem about how to deploy the disaggregated inference paradigm across a group of heterogeneous GPUs, which can be an economical alternative to deployment over homogeneous high-performance GPUs. Towards this end, we introduce HexGen-2, a distributed system for efficient and economical LLM serving on heterogeneous GPUs following the disaggregated paradigm. Built on top of HexGen, the core component of HexGen-2 is a scheduling algorithm that formalizes the allocation of disaggregated LLM inference computations and communications over heterogeneous GPUs and network connections as a constraint optimization problem. We leverage the graph partitioning and max-flow algorithms to co-optimize resource allocation, parallel strategies for distinct inference phases, and the efficiency of inter-phase key-value (KV) cache communications. 
We conduct extensive experiments to evaluate HexGen-2, i.e., on OPT (30B) and Llama-2 (70B) models in various real-world settings, the results reveal that HexGen-2 delivers up to a 2.0 times and on average a 1.3 times improvement in serving throughput, reduces the average inference latency by 1.5 times compared with state-of-the-art systems given the same price budget, and achieves comparable inference performance with a 30% lower price budget. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07903v1-abstract-full').style.display = 'none'; document.getElementById('2502.07903v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07555">arXiv:2502.07555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07555">pdf</a>, <a href="https://arxiv.org/format/2502.07555">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> O1 Embedder: Let Retrievers Think Before Action </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruiran Yan</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Z">Zheng Liu</a>, <a href="/search/?searchtype=author&amp;query=Lian%2C+D">Defu Lian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07555v2-abstract-short" style="display: inline;"> The growing power of large language models (LLMs) has revolutionized how people access and utilize information. Notably, the LLMs excel at performing fine-grained data representation, which facilitates precise retrieval of information. They also generate high-quality answers based on external references, enabling the production of useful knowledge. The recent introduction of reasoning models, like&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07555v2-abstract-full').style.display = 'inline'; document.getElementById('2502.07555v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07555v2-abstract-full" style="display: none;"> The growing power of large language models (LLMs) has revolutionized how people access and utilize information. Notably, the LLMs excel at performing fine-grained data representation, which facilitates precise retrieval of information. They also generate high-quality answers based on external references, enabling the production of useful knowledge. The recent introduction of reasoning models, like OpenAI O1 and DeepSeek R1, marks another leap forward, highlighting LLMs&#39; ability to think progressively before delivering final answers. This breakthrough significantly improves the ability to address complex tasks, e.g., coding and math proofs. Inspired by this progress, we aim to develop similar capabilities for retrieval models, which hold great promise for tackling critical challenges in the field, including multi-task retrieval, zero-shot retrieval, and tasks requiring intensive reasoning of complex relationships. 
With this motivation, we propose a novel approach called O1 Embedder, which generates useful thoughts for the input query before making retrieval for the target documents. To realize this objective, we conquer two technical difficulties. First, we design a data synthesis workflow, creating training signals for O1 Embedder by generating initial thoughts from an LLM-expert and subsequently refining them using a retrieval committee. Second, we optimize the training process, enabling a pre-trained model to be jointly fine-tuned to generate retrieval thoughts via behavior cloning and perform dense retrieval through contrastive learning. Our approach is evaluated by comprehensive experiments, where substantial improvements are achieved across 12 popular datasets, spanning both in-domain and out-of-domain scenarios. These results highlight O1 Embedder&#39;s remarkable accuracy and generalizability, paving the way for the development of next-generation IR foundation models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07555v2-abstract-full').style.display = 'none'; document.getElementById('2502.07555v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01962">arXiv:2502.01962</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01962">pdf</a>, <a href="https://arxiv.org/format/2502.01962">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Memory Efficient Transformer Adapter for Dense Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Dong%2C+P">Pingcheng Dong</a>, <a href="/search/?searchtype=author&amp;query=Cheng%2C+K">Kwang-Ting Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01962v1-abstract-short" style="display: inline;"> While current Vision Transformer (ViT) adapter methods have shown promising accuracy, their inference speed is implicitly hindered by inefficient memory access operations, e.g., standard normalization and frequent reshaping. 
In this work, we propose META, a simple and fast ViT adapter that can improve the model&#39;s memory efficiency and decrease memory time consumption by reducing the inefficient me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01962v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01962v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01962v1-abstract-full" style="display: none;"> While current Vision Transformer (ViT) adapter methods have shown promising accuracy, their inference speed is implicitly hindered by inefficient memory access operations, e.g., standard normalization and frequent reshaping. In this work, we propose META, a simple and fast ViT adapter that can improve the model&#39;s memory efficiency and decrease memory time consumption by reducing the inefficient memory access operations. Our method features a memory-efficient adapter block that enables the common sharing of layer normalization between the self-attention and feed-forward network layers, thereby reducing the model&#39;s reliance on normalization operations. Within the proposed block, the cross-shaped self-attention is employed to reduce the model&#39;s frequent reshaping operations. Moreover, we augment the adapter block with a lightweight convolutional branch that can enhance local inductive biases, particularly beneficial for the dense prediction tasks, e.g., object detection, instance segmentation, and semantic segmentation. The adapter block is finally formulated in a cascaded manner to compute diverse head features, thereby enriching the variety of feature representations. Empirically, extensive evaluations on multiple representative datasets validate that META substantially enhances the predicted quality, while achieving a new state-of-the-art accuracy-efficiency trade-off. 
Theoretically, we demonstrate that META exhibits superior generalization capability and stronger adaptability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01962v1-abstract-full').style.display = 'none'; document.getElementById('2502.01962v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00791">arXiv:2502.00791</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00791">pdf</a>, <a href="https://arxiv.org/format/2502.00791">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Vision-centric Token Compression in Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Xing%2C+L">Ling Xing</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+A+J">Alex Jinpeng Wang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2502.00791v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have revolutionized natural language processing, excelling in handling longer sequences. However, the inefficiency and redundancy in processing extended in-context tokens remain a challenge. Many attempts to address this rely on compressing tokens with smaller text encoders, yet we question whether text encoders are truly indispensable. Our journey leads to an unexpect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00791v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00791v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00791v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have revolutionized natural language processing, excelling in handling longer sequences. However, the inefficiency and redundancy in processing extended in-context tokens remain a challenge. Many attempts to address this rely on compressing tokens with smaller text encoders, yet we question whether text encoders are truly indispensable. Our journey leads to an unexpected discovery-a much smaller vision encoder, applied directly to sequences of text tokens, can rival text encoders on text tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small text understanding benchmarks, VIST leads to comparable results with 16% fewer FLOPs and 50% less memory usage. We further uncover significant token redundancy and devise a frequency-based masking strategy to guide the focus of the visual encoder toward the most critical tokens. Interestingly, we observe the trained visual encoder performs like a summarizer, selectively ignoring less important words such as prepositions and conjunctions. 
This approach delivers remarkable results, outperforming traditional text encoder-based methods by 5.7% on average over benchmarks like TriviaQA, NQ, PopQA, TREF, SST2, and SST5, setting a new standard for token efficiency in LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00791v2-abstract-full').style.display = 'none'; document.getElementById('2502.00791v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00527">arXiv:2502.00527</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00527">pdf</a>, <a href="https://arxiv.org/format/2502.00527">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PolarQuant: Leveraging Polar Transformation for Efficient Key Cache Quantization and Decoding Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Wu%2C+S">Songhao Wu</a>, <a href="/search/?searchtype=author&amp;query=Lv%2C+A">Ang Lv</a>, <a href="/search/?searchtype=author&amp;query=Feng%2C+X">Xiao Feng</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+Y">Yufei Zhang</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+X">Xun Zhang</a>, <a href="/search/?searchtype=author&amp;query=Yin%2C+G">Guojun Yin</a>, <a 
href="/search/?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00527v1-abstract-short" style="display: inline;"> The KV cache in large language models is a dominant factor in memory usage, limiting their broader applicability. Quantizing the cache to lower bit widths is an effective way to reduce computational costs; however, previous methods struggle with quantizing key vectors due to outliers, resulting in excessive overhead. We propose a novel quantization approach called PolarQuant, which efficiently add&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00527v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00527v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00527v1-abstract-full" style="display: none;"> The KV cache in large language models is a dominant factor in memory usage, limiting their broader applicability. Quantizing the cache to lower bit widths is an effective way to reduce computational costs; however, previous methods struggle with quantizing key vectors due to outliers, resulting in excessive overhead. We propose a novel quantization approach called PolarQuant, which efficiently addresses the outlier challenge. We observe that outliers typically appear in only one of two dimensions, which are rotated together by a specific angle when rotary position embeddings are applied. When represented as two-dimensional vectors, these dimensions exhibit well-structured patterns, with radii and angles smoothly distributed in polar coordinates. This alleviates the challenge of outliers on per-channel quantization, making them well-suited for quantization. 
Thus, PolarQuant divides key vectors into groups of two-dimensional sub-vectors, encoding them as the corresponding quantized radius and the polar angle, rather than quantizing original key vectors directly. PolarQuant achieves the superior efficiency in KV cache quantization and accelerates the decoding process by turning the query-key inner product into a table lookup, all while maintaining the downstream performance of full-precision models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00527v1-abstract-full').style.display = 'none'; document.getElementById('2502.00527v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00426">arXiv:2502.00426</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00426">pdf</a>, <a href="https://arxiv.org/format/2502.00426">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TEST-V: TEst-time Support-set Tuning for Zero-shot Video Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+J">Jin Wang</a>, <a href="/search/?searchtype=author&amp;query=Qu%2C+H">Hongyu Qu</a>, <a 
href="/search/?searchtype=author&amp;query=Du%2C+X">Xiaoyu Du</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a>, <a href="/search/?searchtype=author&amp;query=Tan%2C+T">Tieniu Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00426v2-abstract-short" style="display: inline;"> Recently, adapting Vision Language Models (VLMs) to zero-shot visual classification by tuning class embedding with a few prompts (Test-time Prompt Tuning, TPT) or replacing class names with generated visual samples (support-set) has shown promising results. However, TPT cannot avoid the semantic gap between modalities while the support-set cannot be tuned. To this end, we draw on each other&#39;s stre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00426v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00426v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00426v2-abstract-full" style="display: none;"> Recently, adapting Vision Language Models (VLMs) to zero-shot visual classification by tuning class embedding with a few prompts (Test-time Prompt Tuning, TPT) or replacing class names with generated visual samples (support-set) has shown promising results. However, TPT cannot avoid the semantic gap between modalities while the support-set cannot be tuned. To this end, we draw on each other&#39;s strengths and propose a novel framework namely TEst-time Support-set Tuning for zero-shot Video Classification (TEST-V). 
It first dilates the support-set with multiple prompts (Multi-prompting Support-set Dilation, MSD) and then erodes the support-set via learnable weights to mine key cues dynamically (Temporal-aware Support-set Erosion, TSE). Specifically, i) MSD expands the support samples for each class based on multiple prompts enquired from LLMs to enrich the diversity of the support-set. ii) TSE tunes the support-set with factorized learnable weights according to the temporal prediction consistency in a self-supervised manner to dig pivotal supporting cues for each class. $\textbf{TEST-V}$ achieves state-of-the-art results across four benchmarks and has good interpretability for the support-set dilation and erosion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00426v2-abstract-full').style.display = 'none'; document.getElementById('2502.00426v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13074">arXiv:2501.13074</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13074">pdf</a>, <a href="https://arxiv.org/format/2501.13074">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Autonomy-of-Experts Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Lv%2C+A">Ang Lv</a>, <a href="/search/?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/?searchtype=author&amp;query=Qian%2C+Y">Yining Qian</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+S">Songhao Wu</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/?searchtype=author&amp;query=Kang%2C+Z">Zhanhui Kang</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+D">Di Wang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13074v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MoE) models mostly use a router to assign tokens to specific expert modules, activating only partial parameters and often outperforming dense models. We argue that the separation between the router&#39;s decision-making and the experts&#39; execution is a critical yet overlooked issue, leading to suboptimal expert selection and ineffective learning. 
To address this, we propose Autonomy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13074v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13074v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13074v1-abstract-full" style="display: none;"> Mixture-of-Experts (MoE) models mostly use a router to assign tokens to specific expert modules, activating only partial parameters and often outperforming dense models. We argue that the separation between the router&#39;s decision-making and the experts&#39; execution is a critical yet overlooked issue, leading to suboptimal expert selection and ineffective learning. To address this, we propose Autonomy-of-Experts (AoE), a novel MoE paradigm in which experts autonomously select themselves to process inputs. AoE is based on the insight that an expert is aware of its own capacity to effectively process a token, an awareness reflected in the scale of its internal activations. In AoE, routers are removed; instead, experts pre-compute internal activations for inputs and are ranked based on their activation norms. Only the top-ranking experts proceed with the forward pass, while the others abort. The overhead of pre-computing activations is reduced through a low-rank weight factorization. This self-evaluating-then-partner-comparing approach ensures improved expert selection and effective learning. We pre-train language models having 700M up to 4B parameters, demonstrating that AoE outperforms traditional MoE models with comparable efficiency. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13074v1-abstract-full').style.display = 'none'; document.getElementById('2501.13074v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12090">arXiv:2501.12090</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12090">pdf</a>, <a href="https://arxiv.org/format/2501.12090">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Evaluation of Four End-to-End AI Autopilots Using CCTest and the Carla Leaderboard </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Li%2C+C">Changwen Li</a>, <a href="/search/?searchtype=author&amp;query=Sifakis%2C+J">Joseph Sifakis</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rongjie Yan</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12090v2-abstract-short" style="display: inline;"> Scenario-based testing is currently the dominant simulation-based validation approach for ADS. Its effective application raises two interrelated issues. The first is the choice of the method used to generate scenarios, based on various criteria such as risk, degree of autonomy, degree of coverage and representativeness, and complexity. 
The other is the choice of the evaluation method for estimatin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12090v2-abstract-full').style.display = 'inline'; document.getElementById('2501.12090v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12090v2-abstract-full" style="display: none;"> Scenario-based testing is currently the dominant simulation-based validation approach for ADS. Its effective application raises two interrelated issues. The first is the choice of the method used to generate scenarios, based on various criteria such as risk, degree of autonomy, degree of coverage and representativeness, and complexity. The other is the choice of the evaluation method for estimating the safety and performance of the system under test. This work extends a study of the critical configuration testing (CCTest) approach we have already applied to four open modular autopilots. This approach differs from general scenario-based approaches in that it uses only realistic, potentially safe critical scenarios. It enables an accurate assessment of the ability to drive safely in critical situations for which feasible safety policies exist. Any incident observed in the simulation involves the failure of a tested autopilot. The contribution of this paper is twofold. First, we apply the critical configuration testing approach to four end-to-end open autopilots, Transfuser, InterFuser, MILE and LMDriver, and compare their test results with those of the four modular open autopilots previously tested with the same approach implemented in the Carla simulation environment. This comparison identifies both differences and similarities in the failures of the two autopilot types in critical situations. 
Secondly, we compare the evaluations of the four autopilots carried out in the Carla Leaderboard with our results obtained by testing critical configurations. This comparison reveals significant discrepancies, reflecting differences in test case generation criteria and risk assessment methods. It underlines the need to work towards the development of objective assessment methods combining qualitative and quantitative criteria. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12090v2-abstract-full').style.display = 'none'; document.getElementById('2501.12090v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07034">arXiv:2501.07034</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07034">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Explore the Use of Time Series Foundation Model for Car-Following Behavior Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zeng%2C+L">Luwei Zeng</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Runze Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07034v1-abstract-short" style="display: inline;"> Modeling car-following behavior is essential for traffic 
simulation, analyzing driving patterns, and understanding complex traffic flows with varying levels of autonomous vehicles. Traditional models like the Safe Distance Model and Intelligent Driver Model (IDM) require precise parameter calibration and often lack generality due to simplified assumptions about driver behavior. While machine learn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07034v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07034v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07034v1-abstract-full" style="display: none;"> Modeling car-following behavior is essential for traffic simulation, analyzing driving patterns, and understanding complex traffic flows with varying levels of autonomous vehicles. Traditional models like the Safe Distance Model and Intelligent Driver Model (IDM) require precise parameter calibration and often lack generality due to simplified assumptions about driver behavior. While machine learning and deep learning methods capture complex patterns, they require large labeled datasets. Foundation models provide a more efficient alternative. Pre-trained on vast, diverse time series datasets, they can be applied directly to various tasks without the need for extensive re-training. These models generalize well across domains, and with minimal fine-tuning, they can be adapted to specific tasks like car-following behavior prediction. In this paper, we apply Chronos, a state-of-the-art public time series foundation model, to analyze car-following behavior using the Open ACC dataset. Without fine-tuning, Chronos outperforms traditional models like IDM and Exponential smoothing with trend and seasonality (ETS), and achieves similar results to deep learning models such as DeepAR and TFT, with an RMSE of 0.60. 
After fine-tuning, Chronos reduces the error to an RMSE of 0.53, representing a 33.75% improvement over IDM and a 12-37% reduction compared to machine learning models like ETS and deep learning models including DeepAR, WaveNet, and TFT. This demonstrates the potential of foundation models to significantly advance transportation research, offering a scalable, adaptable, and highly accurate approach to predicting and simulating car-following behaviors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07034v1-abstract-full').style.display = 'none'; document.getElementById('2501.07034v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05445">arXiv:2501.05445</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05445">pdf</a>, <a href="https://arxiv.org/format/2501.05445">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Consistent Flow Distillation for Text-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Runjie Yan</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+Y">Yinbo Chen</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+X">Xiaolong Wang</a> </p> <p 
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05445v1-abstract-short" style="display: inline;"> Score Distillation Sampling (SDS) has made significant strides in distilling image-generative models for 3D generation. However, its maximum-likelihood-seeking behavior often leads to degraded visual quality and diversity, limiting its effectiveness in 3D applications. In this work, we propose Consistent Flow Distillation (CFD), which addresses these limitations. We begin by leveraging the gradien&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05445v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05445v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05445v1-abstract-full" style="display: none;"> Score Distillation Sampling (SDS) has made significant strides in distilling image-generative models for 3D generation. However, its maximum-likelihood-seeking behavior often leads to degraded visual quality and diversity, limiting its effectiveness in 3D applications. In this work, we propose Consistent Flow Distillation (CFD), which addresses these limitations. We begin by leveraging the gradient of the diffusion ODE or SDE sampling process to guide the 3D generation. From the gradient-based sampling perspective, we find that the consistency of 2D image flows across different viewpoints is important for high-quality 3D generation. To achieve this, we introduce multi-view consistent Gaussian noise on the 3D object, which can be rendered from various viewpoints to compute the flow gradient. Our experiments demonstrate that CFD, through consistent flows, significantly outperforms previous methods in text-to-3D generation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05445v1-abstract-full').style.display = 'none'; document.getElementById('2501.05445v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://runjie-yan.github.io/cfd/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04584">arXiv:2501.04584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04584">pdf</a>, <a href="https://arxiv.org/ps/2501.04584">ps</a>, <a href="https://arxiv.org/format/2501.04584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> A Direct-adjoint Approach for Material Point Model Calibration with Application to Plasticity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ryan Yan</a>, <a href="/search/?searchtype=author&amp;query=Seidl%2C+D+T">D. Thomas Seidl</a>, <a href="/search/?searchtype=author&amp;query=Jones%2C+R+E">Reese E. 
Jones</a>, <a href="/search/?searchtype=author&amp;query=Papadopoulos%2C+P">Panayiotis Papadopoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04584v1-abstract-short" style="display: inline;"> This paper proposes a new approach for the calibration of material parameters in elastoplastic constitutive models. The calibration is posed as a constrained optimization problem, where the constitutive evolution equations serve as constraints. The objective function quantifies the mismatch between the stress predicted by the model and corresponding experimental measurements. To improve calibratio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04584v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04584v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04584v1-abstract-full" style="display: none;"> This paper proposes a new approach for the calibration of material parameters in elastoplastic constitutive models. The calibration is posed as a constrained optimization problem, where the constitutive evolution equations serve as constraints. The objective function quantifies the mismatch between the stress predicted by the model and corresponding experimental measurements. To improve calibration efficiency, a novel direct-adjoint approach is presented to compute the Hessian of the objective function, which enables the use of second-order optimization algorithms. Automatic differentiation (AD) is used for gradient and Hessian computations. Two numerical examples are employed to validate the Hessian matrices and to demonstrate that the Newton-Raphson algorithm consistently outperforms gradient-based algorithms such as L-BFGS-B. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04584v1-abstract-full').style.display = 'none'; document.getElementById('2501.04584v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> SAND2025-00046O </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04070">arXiv:2501.04070</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04070">pdf</a>, <a href="https://arxiv.org/format/2501.04070">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> More is not always better? 
Enhancing Many-Shot In-Context Learning with Differentiated and Reweighting Objectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zhang%2C+X">Xiaoqing Zhang</a>, <a href="/search/?searchtype=author&amp;query=Lv%2C+A">Ang Lv</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Y">Yuhan Liu</a>, <a href="/search/?searchtype=author&amp;query=Sung%2C+F">Flood Sung</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/?searchtype=author&amp;query=Shang%2C+S">Shuo Shang</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04070v2-abstract-short" style="display: inline;"> Large language models (LLMs) excel at few-shot in-context learning (ICL) without requiring parameter updates. However, as the number of ICL demonstrations increases from a few to many, performance tends to plateau and eventually decline. We identify two primary causes for this trend: the suboptimal negative log-likelihood (NLL) optimization objective and the incremental data noise. To address thes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04070v2-abstract-full').style.display = 'inline'; document.getElementById('2501.04070v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04070v2-abstract-full" style="display: none;"> Large language models (LLMs) excel at few-shot in-context learning (ICL) without requiring parameter updates. However, as the number of ICL demonstrations increases from a few to many, performance tends to plateau and eventually decline. 
We identify two primary causes for this trend: the suboptimal negative log-likelihood (NLL) optimization objective and the incremental data noise. To address these issues, we introduce DrICL, a novel optimization method that enhances model performance through Differentiated Learning and advantage-based Reweighting objectives. Globally, DrICL utilizes differentiated learning to optimize the NLL objective, ensuring that many-shot performance surpasses zero-shot levels. Locally, it dynamically adjusts the weighting of many-shot demonstrations by leveraging cumulative advantages inspired by reinforcement learning, thereby improving generalization. This approach allows the model to handle varying numbers of shots effectively, mitigating the impact of noisy data. Recognizing the lack of multi-task datasets with diverse many-shot distributions, we develop the Many-Shot ICL Benchmark (ICL-50)-a large-scale benchmark of 50 tasks that cover shot numbers from 1 to 350 within sequences of up to 8,000 tokens-for fine-tuning purposes. ICL-50 facilitates the evaluation of many-shot ICL strategies across seven prominent NLP tasks and 50 distinct datasets. Experimental results demonstrate that LLMs enhanced with DrICL achieve significant improvements in many-shot setups across various tasks, including both in-domain and out-of-domain scenarios. We release the code and benchmark dataset hoping to facilitate further research in many-shot ICL. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04070v2-abstract-full').style.display = 'none'; document.getElementById('2501.04070v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures, 11 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03847">arXiv:2501.03847</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03847">pdf</a>, <a href="https://arxiv.org/format/2501.03847">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Gu%2C+Z">Zekai Gu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Lu%2C+J">Jiahao Lu</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+P">Peng Li</a>, <a href="/search/?searchtype=author&amp;query=Dou%2C+Z">Zhiyang Dou</a>, <a 
href="/search/?searchtype=author&amp;query=Si%2C+C">Chenyang Si</a>, <a href="/search/?searchtype=author&amp;query=Dong%2C+Z">Zhen Dong</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/?searchtype=author&amp;query=Lin%2C+C">Cheng Lin</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+W">Wenping Wang</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Y">Yuan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03847v2-abstract-short" style="display: inline;"> Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03847v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03847v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03847v2-abstract-full" style="display: none;"> Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse control demands. 
In this paper, we introduce Diffusion as Shader (DaS), a novel approach that supports multiple video control tasks within a unified architecture. Our key insight is that achieving versatile video control necessitates leveraging 3D control signals, as videos are fundamentally 2D renderings of dynamic 3D content. Unlike prior methods limited to 2D control signals, DaS leverages 3D tracking videos as control inputs, making the video diffusion process inherently 3D-aware. This innovation allows DaS to achieve a wide range of video controls by simply manipulating the 3D tracking videos. A further advantage of using 3D tracking videos is their ability to effectively link frames, significantly enhancing the temporal consistency of the generated videos. With just 3 days of fine-tuning on 8 H800 GPUs using less than 10k videos, DaS demonstrates strong control capabilities across diverse tasks, including mesh-to-video generation, camera control, motion transfer, and object manipulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03847v2-abstract-full').style.display = 'none'; document.getElementById('2501.03847v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://igl-hkust.github.io/das/ Codes: https://github.com/IGL-HKUST/DiffusionAsShader</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01422">arXiv:2501.01422</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01422">pdf</a>, <a href="https://arxiv.org/format/2501.01422">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Modal Video Feature Extraction for Popularity Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+H">Haixu Liu</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+W">Wenning Wang</a>, <a href="/search/?searchtype=author&amp;query=Zheng%2C+H">Haoxiang Zheng</a>, <a href="/search/?searchtype=author&amp;query=Jiang%2C+P">Penghao Jiang</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Q">Qirui Wang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruiqing Yan</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+Q">Qiuzhuang Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01422v1-abstract-short" style="display: inline;"> This work aims to predict the popularity of short videos using the videos themselves and their related features. 
Popularity is measured by four key engagement metrics: view count, like count, comment count, and share count. This study employs video classification models with different architectures and training methods as backbone networks to extract video modality features. Meanwhile, the cleaned&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01422v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01422v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01422v1-abstract-full" style="display: none;"> This work aims to predict the popularity of short videos using the videos themselves and their related features. Popularity is measured by four key engagement metrics: view count, like count, comment count, and share count. This study employs video classification models with different architectures and training methods as backbone networks to extract video modality features. Meanwhile, the cleaned video captions are incorporated into a carefully designed prompt framework, along with the video, as input for video-to-text generation models, which generate detailed text-based video content understanding. These texts are then encoded into vectors using a pre-trained BERT model. Based on the six sets of vectors mentioned above, a neural network is trained for each of the four prediction metrics. Moreover, the study conducts data mining and feature engineering based on the video and tabular data, constructing practical features such as the total frequency of hashtag appearances, the total frequency of mention appearances, video duration, frame count, frame rate, and total time online. Multiple machine learning models are trained, and the most stable model, XGBoost, is selected. Finally, the predictions from the neural network and XGBoost models are averaged to obtain the final result. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01422v1-abstract-full').style.display = 'none'; document.getElementById('2501.01422v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">INFORMS 2024 Data Challenge Competition</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18279">arXiv:2412.18279</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18279">pdf</a>, <a href="https://arxiv.org/format/2412.18279">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Multi-Step Reasoning Abilities of Large Language Models with Direct Advantage Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+J">Jiacai Liu</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+C">Chaojie Wang</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+C+Y">Chris Yuhao Liu</a>, <a href="/search/?searchtype=author&amp;query=Zeng%2C+L">Liang Zeng</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+Y">Yiwen Sun</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y">Yahui Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18279v1-abstract-short" style="display: inline;"> The role of reinforcement learning (RL) in enhancing the reasoning of large language models (LLMs) is becoming increasingly significant. Despite the success of RL in many scenarios, there are still many challenges in improving the reasoning of LLMs. One challenge is the sparse reward, which makes optimization difficult for RL and necessitates a large amount of data samples. Another challenge stems&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18279v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18279v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18279v1-abstract-full" style="display: none;"> The role of reinforcement learning (RL) in enhancing the reasoning of large language models (LLMs) is becoming increasingly significant. Despite the success of RL in many scenarios, there are still many challenges in improving the reasoning of LLMs. One challenge is the sparse reward, which makes optimization difficult for RL and necessitates a large amount of data samples. Another challenge stems from the inherent instability of RL, particularly when using Actor-Critic (AC) methods to derive optimal policies, which often leads to unstable training processes. To address these issues, we introduce Direct Advantage Policy Optimization (DAPO), a novel step-level offline RL algorithm. Unlike standard alignment that rely solely on outcome rewards to optimize policies (such as DPO), DAPO employs a critic function to predict the reasoning accuracy at each step, thereby generating dense signals to refine the generation strategy. 
Additionally, the Actor and Critic components in DAPO are trained independently, avoiding the co-training instability observed in standard AC algorithms like PPO. We train DAPO on mathematical and code query datasets and then evaluate its performance on multiple benchmarks. Our results show that DAPO can effectively enhance the mathematical and code capabilities on both SFT models and RL models, demonstrating the effectiveness of DAPO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18279v1-abstract-full').style.display = 'none'; document.getElementById('2412.18279v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18176">arXiv:2412.18176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18176">pdf</a>, <a href="https://arxiv.org/format/2412.18176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Molar: Multimodal LLMs with Collaborative Filtering Alignment for Enhanced Sequential Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Luo%2C+Y">Yucong Luo</a>, <a href="/search/?searchtype=author&amp;query=Qin%2C+Q">Qitao Qin</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/?searchtype=author&amp;query=Cheng%2C+M">Mingyue Cheng</a>, <a 
href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruiran Yan</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+K">Kefan Wang</a>, <a href="/search/?searchtype=author&amp;query=Ouyang%2C+J">Jie Ouyang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18176v2-abstract-short" style="display: inline;"> Sequential recommendation (SR) systems have evolved significantly over the past decade, transitioning from traditional collaborative filtering to deep learning approaches and, more recently, to large language models (LLMs). While the adoption of LLMs has driven substantial advancements, these models inherently lack collaborative filtering information, relying primarily on textual content data negl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18176v2-abstract-full').style.display = 'inline'; document.getElementById('2412.18176v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18176v2-abstract-full" style="display: none;"> Sequential recommendation (SR) systems have evolved significantly over the past decade, transitioning from traditional collaborative filtering to deep learning approaches and, more recently, to large language models (LLMs). While the adoption of LLMs has driven substantial advancements, these models inherently lack collaborative filtering information, relying primarily on textual content data neglecting other modalities and thus failing to achieve optimal recommendation performance. To address this limitation, we propose Molar, a Multimodal large language sequential recommendation framework that integrates multiple content modalities with ID information to capture collaborative signals effectively. 
Molar employs an MLLM to generate unified item representations from both textual and non-textual data, facilitating comprehensive multimodal modeling and enriching item embeddings. Additionally, it incorporates collaborative filtering signals through a post-alignment mechanism, which aligns user representations from content-based and ID-based models, ensuring precise personalization and robust performance. By seamlessly combining multimodal content with collaborative filtering insights, Molar captures both user interests and contextual semantics, leading to superior recommendation accuracy. Extensive experiments validate that Molar significantly outperforms traditional and LLM-based baselines, highlighting its strength in utilizing multimodal data and collaborative signals for sequential recommendation tasks. The source code is available at https://anonymous.4open.science/r/Molar-8B06/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18176v2-abstract-full').style.display = 'none'; document.getElementById('2412.18176v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15649">arXiv:2412.15649</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15649">pdf</a>, <a href="https://arxiv.org/format/2412.15649">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SLAM-Omni: Timbre-Controllable Voice Interaction System with Single-Stage Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Chen%2C+W">Wenxi Chen</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruiqi Yan</a>, <a href="/search/?searchtype=author&amp;query=Liang%2C+Y">Yuzhe Liang</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+X">Xiquan Li</a>, <a href="/search/?searchtype=author&amp;query=Xu%2C+R">Ruiyang Xu</a>, <a href="/search/?searchtype=author&amp;query=Niu%2C+Z">Zhikang Niu</a>, <a href="/search/?searchtype=author&amp;query=Zhu%2C+Y">Yanqiao Zhu</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Z">Zhanxun Liu</a>, <a href="/search/?searchtype=author&amp;query=Yu%2C+K">Kai Yu</a>, <a href="/search/?searchtype=author&amp;query=Hu%2C+Y">Yuxuan Hu</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jinyu Li</a>, <a href="/search/?searchtype=author&amp;query=Lu%2C+Y">Yan Lu</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+S">Shujie Liu</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+X">Xie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15649v1-abstract-short" 
style="display: inline;"> Recent advancements highlight the potential of end-to-end real-time spoken dialogue systems, showcasing their low latency and high quality. In this paper, we introduce SLAM-Omni, a timbre-controllable, end-to-end voice interaction system with single-stage training. SLAM-Omni achieves zero-shot timbre control by modeling spoken language with semantic tokens and decoupling speaker information to a v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15649v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15649v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15649v1-abstract-full" style="display: none;"> Recent advancements highlight the potential of end-to-end real-time spoken dialogue systems, showcasing their low latency and high quality. In this paper, we introduce SLAM-Omni, a timbre-controllable, end-to-end voice interaction system with single-stage training. SLAM-Omni achieves zero-shot timbre control by modeling spoken language with semantic tokens and decoupling speaker information to a vocoder. By predicting grouped speech semantic tokens at each step, our method significantly reduces the sequence length of audio tokens, accelerating both training and inference. Additionally, we propose historical text prompting to compress dialogue history, facilitating efficient multi-round interactions. Comprehensive evaluations reveal that SLAM-Omni outperforms prior models of similar scale, requiring only 15 hours of training on 4 GPUs with limited data. Notably, it is the first spoken dialogue system to achieve competitive performance with a single-stage training approach, eliminating the need for pre-training on TTS or ASR tasks. Further experiments validate its multilingual and multi-turn dialogue capabilities on larger datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15649v1-abstract-full').style.display = 'none'; document.getElementById('2412.15649v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15634">arXiv:2412.15634</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15634">pdf</a>, <a href="https://arxiv.org/format/2412.15634">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Darkit: A User-Friendly Software Toolkit for Spiking Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Du%2C+X">Xin Du</a>, <a href="/search/?searchtype=author&amp;query=Ye%2C+S">Shifan Ye</a>, <a href="/search/?searchtype=author&amp;query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/?searchtype=author&amp;query=Hu%2C+Y">Yangfan Hu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Qi%2C+S">Shunyu Qi</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+S">Shuyang Chen</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+H">Huajin Tang</a>, <a href="/search/?searchtype=author&amp;query=Pan%2C+G">Gang Pan</a>, <a href="/search/?searchtype=author&amp;query=Deng%2C+S">Shuiguang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2412.15634v1-abstract-short" style="display: inline;"> Large language models (LLMs) have been widely applied in various practical applications, typically comprising billions of parameters, with inference processes requiring substantial energy and computational resources. In contrast, the human brain, employing bio-plausible spiking mechanisms, can accomplish the same tasks while significantly reducing energy consumption, even with a similar number of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15634v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15634v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15634v1-abstract-full" style="display: none;"> Large language models (LLMs) have been widely applied in various practical applications, typically comprising billions of parameters, with inference processes requiring substantial energy and computational resources. In contrast, the human brain, employing bio-plausible spiking mechanisms, can accomplish the same tasks while significantly reducing energy consumption, even with a similar number of parameters. Based on this, several pioneering researchers have proposed and implemented various large language models that leverage spiking neural networks. They have demonstrated the feasibility of these models, validated their performance, and open-sourced their frameworks and partial source code. To accelerate the adoption of brain-inspired large language models and facilitate secondary development for researchers, we are releasing a software toolkit named DarwinKit (Darkit). The toolkit is designed specifically for learners, researchers, and developers working on spiking large models, offering a suite of highly user-friendly features that greatly simplify the learning, deployment, and development processes. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15634v1-abstract-full').style.display = 'none'; document.getElementById('2412.15634v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14523">arXiv:2412.14523</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14523">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> </div> </div> <p class="title is-5 mathjax"> Provincial allocation of China&#39;s commercial building operational carbon towards carbon neutrality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Deng%2C+Y">Yanqiao Deng</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+M">Minda Ma</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+N">Nan Zhou</a>, <a href="/search/?searchtype=author&amp;query=Zou%2C+C">Chenchen Zou</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+Z">Zhili Ma</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ran Yan</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+X">Xin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14523v2-abstract-short" style="display: inline;"> National carbon peak track and optimized provincial carbon allocations are crucial for mitigating regional inequality within the commercial building sector during China&#39;s transition to carbon neutrality. 
This study proposes a top-down model to evaluate carbon trajectories in operational commercial buildings up to 2060. Through Monte Carlo simulation, scenario analysis is conducted to assess carbon&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14523v2-abstract-full').style.display = 'inline'; document.getElementById('2412.14523v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14523v2-abstract-full" style="display: none;"> National carbon peak track and optimized provincial carbon allocations are crucial for mitigating regional inequality within the commercial building sector during China&#39;s transition to carbon neutrality. This study proposes a top-down model to evaluate carbon trajectories in operational commercial buildings up to 2060. Through Monte Carlo simulation, scenario analysis is conducted to assess carbon peak values and the corresponding peaking year, thereby optimizing carbon allocation schemes both nationwide and provincially. The results reveal that (1) the nationwide carbon peak for commercial building operations is projected to reach 890 (+- 50) megatons of carbon dioxide (MtCO2) by 2028 (+- 3.7 years) in the case of the business-as-usual scenario, with a 7.87% probability of achieving the carbon peak under the decarbonization scenario. (2) Significant disparities will exist among provinces, with Shandong&#39;s carbon peak projected at 69.6 (+- 4.0) MtCO2 by 2029, approximately 11 times higher than Ningxia&#39;s peak of 6.0 (+- 0.3) MtCO2 by 2027. (3) Guided by the principle of maximizing the emission reduction potential, the optimal provincial allocation scheme reveals the top three provinces requiring the most significant reductions in the commercial sector: Xinjiang (5.6 MtCO2), Shandong (4.8 MtCO2), and Henan (4.7 MtCO2). 
Overall, this study offers optimized provincial carbon allocation strategies within the commercial building sector in China via dynamic scenario simulations, with the goal of hitting the carbon peak target and progressing toward a low-carbon future for the building sector. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14523v2-abstract-full').style.display = 'none'; document.getElementById('2412.14523v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07778">arXiv:2412.07778</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07778">pdf</a>, <a href="https://arxiv.org/format/2412.07778">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MIN: Multi-channel Interaction Network for Drug-Target Interaction with Protein Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Li%2C+S">Shuqi Li</a>, <a href="/search/?searchtype=author&amp;query=Xie%2C+S">Shufang Xie</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+H">Hongda Sun</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+Y">Yuhan Chen</a>, <a href="/search/?searchtype=author&amp;query=Qin%2C+T">Tao Qin</a>, <a 
href="/search/?searchtype=author&amp;query=Ke%2C+T">Tianjun Ke</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07778v1-abstract-short" style="display: inline;"> Traditional drug discovery processes are both time-consuming and require extensive professional expertise. With the accumulation of drug-target interaction (DTI) data from experimental studies, leveraging modern machine-learning techniques to discern patterns between drugs and target proteins has become increasingly feasible. In this paper, we introduce the Multi-channel Interaction Network (MIN),&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07778v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07778v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07778v1-abstract-full" style="display: none;"> Traditional drug discovery processes are both time-consuming and require extensive professional expertise. With the accumulation of drug-target interaction (DTI) data from experimental studies, leveraging modern machine-learning techniques to discern patterns between drugs and target proteins has become increasingly feasible. In this paper, we introduce the Multi-channel Interaction Network (MIN), a novel framework designed to predict DTIs through two primary components: a representation learning module and a multi-channel interaction module. The representation learning module features a C-Score Predictor-assisted screening mechanism, which selects critical residues to enhance prediction accuracy and reduce noise. 
The multi-channel interaction module incorporates a structure-agnostic channel, a structure-aware channel, and an extended-mixture channel, facilitating the identification of interaction patterns at various levels for optimal complementarity. Additionally, contrastive learning is utilized to harmonize the representations of diverse data types. Our experimental evaluations on public datasets demonstrate that MIN surpasses other strong DTI prediction methods. Furthermore, the case study reveals a high overlap between the residues selected by the C-Score Predictor and those in actual binding pockets, underscoring MIN&#39;s explainability capability. These findings affirm that MIN is not only a potent tool for DTI prediction but also offers fresh insights into the prediction of protein binding sites. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07778v1-abstract-full').style.display = 'none'; document.getElementById('2412.07778v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06360">arXiv:2412.06360</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06360">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> </div> </div> <p class="title is-5 mathjax"> India&#39;s residential space cooling transition: Decarbonization ambitions since the turn of millennium </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ran Yan</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+N">Nan Zhou</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+M">Minda Ma</a>, <a href="/search/?searchtype=author&amp;query=Mao%2C+C">Chao Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06360v3-abstract-short" style="display: inline;"> As an emerging emitter poised for significant growth in space cooling demand, India requires comprehensive insights into historical emission trends and decarbonization performance to shape future low-carbon cooling strategies. 
By integrating a bottom-up demand resource energy analysis model and a top-down decomposition method, this study is the first to conduct a state-level analysis of carbon emi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06360v3-abstract-full').style.display = 'inline'; document.getElementById('2412.06360v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06360v3-abstract-full" style="display: none;"> As an emerging emitter poised for significant growth in space cooling demand, India requires comprehensive insights into historical emission trends and decarbonization performance to shape future low-carbon cooling strategies. By integrating a bottom-up demand resource energy analysis model and a top-down decomposition method, this study is the first to conduct a state-level analysis of carbon emission trends and the corresponding decarbonization efforts for residential space cooling in urban and rural India from 2000 to 2022. The results indicate that (1) the carbon intensity of residential space cooling in India increased by 292.4% from 2000 to 2022, reaching 513.8 kilograms of carbon dioxide per household. The net state domestic product per capita, representing income, emerged as the primary positive contributor. (2) The increase in carbon emissions from space cooling can be primarily attributed to the use of fans. While fan-based space cooling has nearly saturated Indian urban households, it is anticipated to persist as the primary cooling method in rural households for decades. (3) States with higher decarbonization potential are concentrated in two categories: those with high household income and substantial cooling appliance ownership and those with pronounced unmet cooling demand but low household income and hot climates. 
Furthermore, it is believed that promoting energy-efficient building designs can be prioritized to achieve affordable space cooling. Overall, this study serves as an effective foundation for formulating and promoting India&#39;s future cooling action plan, addressing the country&#39;s rising residential cooling demands and striving toward its net-zero goal by 2070. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06360v3-abstract-full').style.display = 'none'; document.getElementById('2412.06360v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00699">arXiv:2412.00699</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00699">pdf</a>, <a href="https://arxiv.org/format/2412.00699">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> </div> </div> <p class="title is-5 mathjax"> A one-dimensional mixing model for the impact of ablative Rayleigh-Taylor instability on compression dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+D">Dongxue Liu</a>, <a href="/search/?searchtype=author&amp;query=Tao%2C+T">Tao Tao</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jun Li</a>, <a href="/search/?searchtype=author&amp;query=Jia%2C+Q">Qing Jia</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a 
href="/search/?searchtype=author&amp;query=Zheng%2C+J">Jian Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00699v2-abstract-short" style="display: inline;"> A one-dimensional mixing model, incorporating the effects of laser ablation and initial perturbations, is developed to study the influence of ablative Rayleigh-Taylor instability on compression dynamics. The length of the mixing region is determined with the buoyancy-drag model[arXiv:2411.12392v2 (2024)]. The mixing effect on laser ablation is mainly described with an additional heat source which&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00699v2-abstract-full').style.display = 'inline'; document.getElementById('2412.00699v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00699v2-abstract-full" style="display: none;"> A one-dimensional mixing model, incorporating the effects of laser ablation and initial perturbations, is developed to study the influence of ablative Rayleigh-Taylor instability on compression dynamics. The length of the mixing region is determined with the buoyancy-drag model[arXiv:2411.12392v2 (2024)]. The mixing effect on laser ablation is mainly described with an additional heat source which depends on turbulent kinetic energy and initial perturbation level through a free multiplier. The model is integrated into a one-dimensional radiation hydrodynamics code and validated against two-dimensional planar simulations. The further application of our model to spherical implosion simulations reveals that the model can give reasonable predictions of implosion degradation due to mixing, such as lowered shell compression, reduced stagnation pressure, and decreased areal density, etc. 
It is found that the time interval between the convergence of the main shock and stagnation may offer an estimate of mixing level in single-shot experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00699v2-abstract-full').style.display = 'none'; document.getElementById('2412.00699v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18328">arXiv:2411.18328</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18328">pdf</a>, <a href="https://arxiv.org/format/2411.18328">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EventCrab: Harnessing Frame and Point Synergy for Event-based Action Recognition and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Cao%2C+M">Meiqi Cao</a>, <a href="/search/?searchtype=author&amp;query=Shu%2C+X">Xiangbo Shu</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+J">Jiachao Zhang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+Z">Zechao Li</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2411.18328v1-abstract-short" style="display: inline;"> Event-based Action Recognition (EAR) possesses the advantages of high-temporal resolution capturing and privacy preservation compared with traditional action recognition. Current leading EAR solutions typically follow two regimes: project unconstructed event streams into dense constructed event frames and adopt powerful frame-specific networks, or employ lightweight point-specific networks to hand&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18328v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18328v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18328v1-abstract-full" style="display: none;"> Event-based Action Recognition (EAR) possesses the advantages of high-temporal resolution capturing and privacy preservation compared with traditional action recognition. Current leading EAR solutions typically follow two regimes: project unconstructed event streams into dense constructed event frames and adopt powerful frame-specific networks, or employ lightweight point-specific networks to handle sparse unconstructed event points directly. However, such two regimes are blind to a fundamental issue: failing to accommodate the unique dense temporal and sparse spatial properties of asynchronous event data. In this article, we present a synergy-aware framework, i.e., EventCrab, that adeptly integrates the &#34;lighter&#34; frame-specific networks for dense event frames with the &#34;heavier&#34; point-specific networks for sparse event points, balancing accuracy and efficiency. Furthermore, we establish a joint frame-text-point representation space to bridge distinct event frames and points. 
In specific, to better exploit the unique spatiotemporal relationships inherent in asynchronous event points, we devise two strategies for the &#34;heavier&#34; point-specific embedding: i) a Spiking-like Context Learner (SCL) that extracts contextualized event points from raw event streams. ii) an Event Point Encoder (EPE) that further explores event-point long spatiotemporal features in a Hilbert-scan way. Experiments on four datasets demonstrate the significant performance of our proposed EventCrab, particularly gaining improvements of 5.17% on SeAct and 7.01% on HARDVS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18328v1-abstract-full').style.display = 'none'; document.getElementById('2411.18328v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12392">arXiv:2411.12392</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12392">pdf</a>, <a href="https://arxiv.org/format/2411.12392">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> </div> </div> <p class="title is-5 mathjax"> A buoyancy-drag model with a time-varying drag coefficient for evaluating bubble front penetration depth </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+D">Dongxue Liu</a>, <a href="/search/?searchtype=author&amp;query=Tao%2C+T">Tao Tao</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jun Li</a>, <a href="/search/?searchtype=author&amp;query=Jia%2C+Q">Qing Jia</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Zheng%2C+J">Jian Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12392v2-abstract-short" style="display: inline;"> To evaluate and control bubble front penetration depth ${{h}_{B}}$ induced by ablative Rayleigh-Taylor instability (ARTI) from a weakly nonlinear phase to a self-similar phase, we first propose an improved buoyancy-drag (BD) model with a time-varying drag coefficient. 
The coefficient incorporates the influence of multiple physical mechanisms, including non-steady ablation, preheating, and other me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12392v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12392v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12392v2-abstract-full" style="display: none;"> To evaluate and control bubble front penetration depth ${{h}_{B}}$ induced by ablative Rayleigh-Taylor instability (ARTI) from a weakly nonlinear phase to a self-similar phase, we first propose an improved buoyancy-drag (BD) model with a time-varying drag coefficient. The coefficient incorporates the influence of multiple physical mechanisms, including non-steady ablation, preheating, and other mechanisms during this phase. The model is validated through simulations under various conditions, demonstrating improved accuracy compared to the classical BD model and the self-similar growth. Furthermore, the model suggests controlling ${{h}_{B}}$ by suppressing the &#34;most dangerous mode&#34;, which is influenced by initial perturbations and ablative acceleration history, thus offering novel insights for target manufacturing and pulse optimization near the ignition threshold. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12392v2-abstract-full').style.display = 'none'; document.getElementById('2411.12392v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10709">arXiv:2411.10709</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10709">pdf</a>, <a href="https://arxiv.org/format/2411.10709">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diagnostic Text-guided Representation Learning in Hierarchical Classification for Pathological Whole Slide Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jiawen Li</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+Q">Qiehe Sun</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Renao Yan</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Y">Yizhi Wang</a>, <a href="/search/?searchtype=author&amp;query=Fu%2C+Y">Yuqiu Fu</a>, <a href="/search/?searchtype=author&amp;query=Wei%2C+Y">Yani Wei</a>, <a href="/search/?searchtype=author&amp;query=Guan%2C+T">Tian Guan</a>, <a href="/search/?searchtype=author&amp;query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yonghonghe He</a>, <a href="/search/?searchtype=author&amp;query=Han%2C+A">Anjia Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10709v1-abstract-short" style="display: inline;"> With the development of digital imaging in medical microscopy, artificial intelligent-based analysis of pathological whole slide images (WSIs) provides a powerful tool for cancer diagnosis. 
Limited by the expensive cost of pixel-level annotation, current research primarily focuses on representation learning with slide-level labels, showing success in various downstream tasks. However, given the di&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10709v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10709v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10709v1-abstract-full" style="display: none;"> With the development of digital imaging in medical microscopy, artificial intelligent-based analysis of pathological whole slide images (WSIs) provides a powerful tool for cancer diagnosis. Limited by the expensive cost of pixel-level annotation, current research primarily focuses on representation learning with slide-level labels, showing success in various downstream tasks. However, given the diversity of lesion types and the complex relationships between each other, these techniques still deserve further exploration in addressing advanced pathology tasks. To this end, we introduce the concept of hierarchical pathological image classification and propose a representation learning called PathTree. PathTree considers the multi-classification of diseases as a binary tree structure. Each category is represented as a professional pathological text description, which messages information with a tree-like encoder. The interactive text features are then used to guide the aggregation of hierarchical multiple representations. PathTree uses slide-text similarity to obtain probability scores and introduces two extra tree specific losses to further constrain the association between texts and slides. 
Through extensive experiments on three challenging hierarchical classification datasets: in-house cryosectioned lung tissue lesion identification, public prostate cancer grade assessment, and public breast cancer subtyping, our proposed PathTree is consistently competitive compared to the state-of-the-art methods and provides a new perspective on the deep learning-assisted solution for more complex WSI classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10709v1-abstract-full').style.display = 'none'; document.getElementById('2411.10709v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 13 figures. 
Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07176">arXiv:2411.07176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07176">pdf</a>, <a href="https://arxiv.org/format/2411.07176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> More Expressive Attention with Negative Weights </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Lv%2C+A">Ang Lv</a>, <a href="/search/?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/?searchtype=author&amp;query=Liao%2C+J">Jiayi Liao</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/?searchtype=author&amp;query=Kang%2C+Z">Zhanhui Kang</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+D">Di Wang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07176v3-abstract-short" style="display: inline;"> We propose a novel attention mechanism, named Cog Attention, that enables attention weights to be negative for enhanced expressiveness, which stems from two key factors: (1) Cog Attention enhances parameter flexibility. 
For example, unlike traditional softmax attention heads that use a static output-value (OV) matrix to delete or copy inputs that the heads attend to, Cog Attention naturally learns&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07176v3-abstract-full').style.display = 'inline'; document.getElementById('2411.07176v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07176v3-abstract-full" style="display: none;"> We propose a novel attention mechanism, named Cog Attention, that enables attention weights to be negative for enhanced expressiveness, which stems from two key factors: (1) Cog Attention enhances parameter flexibility. For example, unlike traditional softmax attention heads that use a static output-value (OV) matrix to delete or copy inputs that the heads attend to, Cog Attention naturally learns to use the sign of dynamic query-key (QK) inner products to represent these operations. This enables Cog Attention to perform multiple operations simultaneously within a single head. Meanwhile, Cog Attention&#39;s OV matrix can focus more on refinement or modification. (2) Cog Attention enhances the model&#39;s robustness against representational collapse by preventing the ``over-squashing&#39;&#39; of earlier tokens into later positions. We develop Transformer-like models which use Cog Attention as attention modules, including decoder-only models at various scales for language modeling and U-ViT diffusion models for image generation. Experiments show that models using Cog Attention exhibit superior performance compared to those employing traditional softmax attention modules. Our approach suggests a promising research direction for rethinking and breaking the entrenched constraints of traditional softmax attention, such as the requirement for non-negative weights. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07176v3-abstract-full').style.display = 'none'; document.getElementById('2411.07176v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06818">arXiv:2411.06818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06818">pdf</a>, <a href="https://arxiv.org/format/2411.06818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistical Mechanics">cond-mat.stat-mech</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Soft Condensed Matter">cond-mat.soft</span> </div> </div> <p class="title is-5 mathjax"> Mechanism of the Nonequilibrium Phase Transition in Self-Propelled Particles with Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruizhe Yan</a>, <a href="/search/?searchtype=author&amp;query=Su%2C+J">Jie Su</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+J">Jin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06818v1-abstract-short" style="display: inline;"> Self-propelled particles with alignment, displaying ordered collective motions such as swarming, can be investigated by the well-known Vicsek model. 
However, challenges still remain regarding the nature of the associated phase transition. Here, we use the landscape-flux approach combined with the coarse-grained mapping method to reveal the underlying mechanism of the continuous or discontinuous or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06818v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06818v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06818v1-abstract-full" style="display: none;"> Self-propelled particles with alignment, displaying ordered collective motions such as swarming, can be investigated by the well-known Vicsek model. However, challenges still remain regarding the nature of the associated phase transition. Here, we use the landscape-flux approach combined with the coarse-grained mapping method to reveal the underlying mechanism of the continuous or discontinuous order-disorder nonequilibrium phase transition in Vicsek model systems featuring diverse noise characteristics. It is found that the nonequilibrium flux inside the landscape in the density-alignment degree phase space always rotates counterclockwise, and tends to delocalize or destabilize the point attractor states, providing the dynamical driving force for altering the landscape shape and the system state. Furthermore, the variations in the averaged flux and entropy production rate exhibit pronounced differences across various noise types. This not only helps to reveal the dynamical and thermodynamical mechanisms of the order-disorder transition but also offers a useful tool to recognize the continuity of the transition. Our findings present a novel perspective for exploring nonequilibrium phase transition behaviors and other collective motions in various complex systems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06818v1-abstract-full').style.display = 'none'; document.getElementById('2411.06818v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06391">arXiv:2411.06391</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06391">pdf</a>, <a href="https://arxiv.org/format/2411.06391">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CausalStock: Deep End-to-end Causal Discovery for News-driven Stock Movement Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Li%2C+S">Shuqi Li</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+Y">Yuebo Sun</a>, <a href="/search/?searchtype=author&amp;query=Lin%2C+Y">Yuxin Lin</a>, <a href="/search/?searchtype=author&amp;query=Gao%2C+X">Xin Gao</a>, <a href="/search/?searchtype=author&amp;query=Shang%2C+S">Shuo Shang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: 
<span class="abstract-short has-text-grey-dark mathjax" id="2411.06391v1-abstract-short" style="display: inline;"> There are two issues in news-driven multi-stock movement prediction tasks that are not well solved in the existing works. On the one hand, &#34;relation discovery&#34; is a pivotal part when leveraging the price information of other stocks to achieve accurate stock movement prediction. Given that stock relations are often unidirectional, such as the &#34;supplier-consumer&#34; relationship, causal relations are m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06391v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06391v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06391v1-abstract-full" style="display: none;"> There are two issues in news-driven multi-stock movement prediction tasks that are not well solved in the existing works. On the one hand, &#34;relation discovery&#34; is a pivotal part when leveraging the price information of other stocks to achieve accurate stock movement prediction. Given that stock relations are often unidirectional, such as the &#34;supplier-consumer&#34; relationship, causal relations are more appropriate to capture the impact between stocks. On the other hand, there is substantial noise existing in the news data leading to extracting effective information with difficulty. With these two issues in mind, we propose a novel framework called CausalStock for news-driven multi-stock movement prediction, which discovers the temporal causal relations between stocks. We design a lag-dependent temporal causal discovery mechanism to model the temporal causal graph distribution. Then a Functional Causal Model is employed to encapsulate the discovered causal relations and predict the stock movements. 
Additionally, we propose a Denoised News Encoder by taking advantage of the excellent text evaluation ability of large language models (LLMs) to extract useful information from massive news data. The experimental results show that CausalStock outperforms the strong baselines for both news-driven multi-stock movement prediction and multi-stock movement prediction tasks on six real-world datasets collected from the US, China, Japan, and UK markets. Moreover, benefiting from the causal relations, CausalStock could offer a clear prediction mechanism with good explainability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06391v1-abstract-full').style.display = 'none'; document.getElementById('2411.06391v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05928">arXiv:2411.05928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05928">pdf</a>, <a href="https://arxiv.org/format/2411.05928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reducing Distraction in Long-Context Language Models by Focused Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Wu%2C+Z">Zijun Wu</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+B">Bingyuan Liu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ran Yan</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+L">Lei Chen</a>, <a href="/search/?searchtype=author&amp;query=Delteil%2C+T">Thomas Delteil</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05928v1-abstract-short" style="display: inline;"> Recent advancements in Large Language Models (LLMs) have significantly enhanced their capacity to process long contexts. However, effectively utilizing this long context remains a challenge due to the issue of distraction, where irrelevant information dominates lengthy contexts, causing LLMs to lose focus on the most relevant segments. 
To address this, we propose a novel training method that enhan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05928v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05928v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05928v1-abstract-full" style="display: none;"> Recent advancements in Large Language Models (LLMs) have significantly enhanced their capacity to process long contexts. However, effectively utilizing this long context remains a challenge due to the issue of distraction, where irrelevant information dominates lengthy contexts, causing LLMs to lose focus on the most relevant segments. To address this, we propose a novel training method that enhances LLMs&#39; ability to discern relevant information through a unique combination of retrieval-based data augmentation and contrastive learning. Specifically, during fine-tuning with long contexts, we employ a retriever to extract the most relevant segments, serving as augmented inputs. We then introduce an auxiliary contrastive learning objective to explicitly ensure that outputs from the original context and the retrieved sub-context are closely aligned. Extensive experiments on long single-document and multi-document QA benchmarks demonstrate the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05928v1-abstract-full').style.display = 'none'; document.getElementById('2411.05928v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01143">arXiv:2411.01143</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01143">pdf</a>, <a href="https://arxiv.org/format/2411.01143">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> A Large-scale Time-aware Agents Simulation for Influencer Selection in Digital Advertising Campaigns </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zhang%2C+X">Xiaoqing Zhang</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Y">Yuhan Liu</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+J">Jianzhou Wang</a>, <a href="/search/?searchtype=author&amp;query=Hu%2C+Z">Zhenxing Hu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01143v1-abstract-short" style="display: inline;"> In the digital world, influencers are pivotal as opinion leaders, shaping the views and choices of their influencees. Modern advertising often follows this trend, where marketers choose appropriate influencers for product endorsements, based on thorough market analysis. 
Previous studies on influencer selection have typically relied on numerical representations of individual opinions and interactio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01143v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01143v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01143v1-abstract-full" style="display: none;"> In the digital world, influencers are pivotal as opinion leaders, shaping the views and choices of their influencees. Modern advertising often follows this trend, where marketers choose appropriate influencers for product endorsements, based on thorough market analysis. Previous studies on influencer selection have typically relied on numerical representations of individual opinions and interactions, a method that simplifies the intricacies of social dynamics. In this work, we first introduce a Time-aware Influencer Simulator (TIS), helping promoters identify and select the right influencers to market their products, based on LLM simulation. To validate our approach, we conduct experiments on the public advertising campaign dataset SAGraph which encompasses social relationships, posts, and user interactions. The results show that our method outperforms traditional numerical feature-based approaches and methods using limited LLM agents. Our research shows that simulating user timelines and content lifecycles over time simplifies scaling, allowing for large-scale agent simulations in social networks. Additionally, LLM-based agents for social recommendations and advertising offer substantial benefits for decision-making in promotional campaigns. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01143v1-abstract-full').style.display = 'none'; document.getElementById('2411.01143v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19064">arXiv:2410.19064</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19064">pdf</a>, <a href="https://arxiv.org/format/2410.19064">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> From a Tiny Slip to a Giant Leap: An LLM-Based Simulation for Fake News Evolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+Y">Yuhan Liu</a>, <a href="/search/?searchtype=author&amp;query=Song%2C+Z">Zirui Song</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+X">Xiaoqing Zhang</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19064v1-abstract-short" style="display: inline;"> With 
the growing spread of misinformation online, research has increasingly focused on detecting and tracking fake news. However, an overlooked issue is that fake news does not naturally exist in social networks -- it often originates from distorted facts or deliberate fabrication by malicious actors. Understanding how true news gradually evolves into fake news is critical for early detection and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19064v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19064v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19064v1-abstract-full" style="display: none;"> With the growing spread of misinformation online, research has increasingly focused on detecting and tracking fake news. However, an overlooked issue is that fake news does not naturally exist in social networks -- it often originates from distorted facts or deliberate fabrication by malicious actors. Understanding how true news gradually evolves into fake news is critical for early detection and prevention, reducing its spread and impact. Hence, in this paper, we take the first step toward simulating and revealing this evolution, proposing a Fake News evolUtion Simulation framEwork (FUSE) based on large language models (LLMs). Specifically, we employ LLM as agents to represent individuals in a simulated social network. We define four types of agents commonly observed in daily interactions: spreaders, who propagate information; commentators, who provide opinions and interpretations; verifiers, who check the accuracy of information; and bystanders, who passively observe without engaging. For simulated environments, we model various social network structures, such as high-clustering networks and scale-free networks, to mirror real-world network dynamics. 
Each day, the agents engage in belief exchanges, reflect on their thought processes, and reintroduce the news accordingly. Given the lack of prior work in this area, we developed a FUSE-EVAL evaluation framework to measure the deviation from true news during the fake news evolution process. The results show that FUSE successfully captures the underlying patterns of how true news transforms into fake news and accurately reproduces previously discovered instances of fake news, aligning closely with human evaluations. Moreover, our work provides insights into the fact that combating fake news should not be delayed until it has fully evolved; instead, prevention in advance is key to achieving better outcomes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19064v1-abstract-full').style.display = 'none'; document.getElementById('2410.19064v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18451">arXiv:2410.18451</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18451">pdf</a>, <a href="https://arxiv.org/format/2410.18451">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+C+Y">Chris Yuhao Liu</a>, <a href="/search/?searchtype=author&amp;query=Zeng%2C+L">Liang Zeng</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+J">Jiacai Liu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=He%2C+J">Jujie He</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+C">Chaojie Wang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+S">Shuicheng Yan</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y">Yahui Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18451v1-abstract-short" style="display: inline;"> In this report, we introduce a collection of methods to enhance reward modeling for LLMs, focusing specifically on data-centric techniques. 
We propose effective data selection and filtering strategies for curating high-quality open-source preference datasets, culminating in the Skywork-Reward data collection, which contains only 80K preference pairs -- significantly smaller than existing datasets.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18451v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18451v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18451v1-abstract-full" style="display: none;"> In this report, we introduce a collection of methods to enhance reward modeling for LLMs, focusing specifically on data-centric techniques. We propose effective data selection and filtering strategies for curating high-quality open-source preference datasets, culminating in the Skywork-Reward data collection, which contains only 80K preference pairs -- significantly smaller than existing datasets. Using this curated dataset, we developed the Skywork-Reward model series -- Skywork-Reward-Gemma-27B and Skywork-Reward-Llama-3.1-8B -- with the former currently holding the top position on the RewardBench leaderboard. Notably, our techniques and datasets have directly enhanced the performance of many top-ranked models on RewardBench, highlighting the practical impact of our contributions in real-world preference learning applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18451v1-abstract-full').style.display = 'none'; document.getElementById('2410.18451v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15689">arXiv:2410.15689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15689">pdf</a>, <a href="https://arxiv.org/format/2410.15689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Enhancing SNN-based Spatio-Temporal Learning: A Benchmark Dataset and Cross-Modality Attention Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Zhou%2C+S">Shibo Zhou</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/?searchtype=author&amp;query=Yuan%2C+M">Mengwen Yuan</a>, <a href="/search/?searchtype=author&amp;query=Jiang%2C+R">Runhao Jiang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a href="/search/?searchtype=author&amp;query=Pan%2C+G">Gang Pan</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+H">Huajin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15689v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs), renowned for their low power consumption, brain-inspired architecture, and spatio-temporal representation capabilities, have garnered considerable attention in recent years. Similar to Artificial Neural Networks (ANNs), high-quality benchmark datasets are of great importance to the advances of SNNs. 
However, our analysis indicates that many prevalent neuromorphic da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15689v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15689v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs), renowned for their low power consumption, brain-inspired architecture, and spatio-temporal representation capabilities, have garnered considerable attention in recent years. Similar to Artificial Neural Networks (ANNs), high-quality benchmark datasets are of great importance to the advances of SNNs. However, our analysis indicates that many prevalent neuromorphic datasets lack strong temporal correlation, preventing SNNs from fully exploiting their spatio-temporal representation capabilities. Meanwhile, the integration of event and frame modalities offers more comprehensive visual spatio-temporal information. Yet, the SNN-based cross-modality fusion remains underexplored. In this work, we present a neuromorphic dataset called DVS-SLR that can better exploit the inherent spatio-temporal properties of SNNs. Compared to existing datasets, it offers advantages in terms of higher temporal correlation, larger scale, and more varied scenarios. In addition, our neuromorphic dataset contains corresponding frame data, which can be used for developing SNN-based fusion methods. By virtue of the dual-modal feature of the dataset, we propose a Cross-Modality Attention (CMA) based fusion method. The CMA model efficiently utilizes the unique advantages of each modality, allowing for SNNs to learn both temporal and spatial attention scores from the spatio-temporal features of event and frame modalities, subsequently allocating these scores across modalities to enhance their synergy. 
Experimental results demonstrate that our method not only improves recognition accuracy but also ensures robustness across diverse scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15689v1-abstract-full').style.display = 'none'; document.getElementById('2410.15689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14799">arXiv:2410.14799</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14799">pdf</a>, <a href="https://arxiv.org/format/2410.14799">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Generic Dynamic Object Detection Based on Dynamic Grid Maps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rujiao Yan</a>, <a href="/search/?searchtype=author&amp;query=Schubert%2C+L">Linda Schubert</a>, <a href="/search/?searchtype=author&amp;query=Kamm%2C+A">Alexander Kamm</a>, <a href="/search/?searchtype=author&amp;query=Komar%2C+M">Matthias Komar</a>, <a href="/search/?searchtype=author&amp;query=Schreier%2C+M">Matthias Schreier</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14799v1-abstract-short" style="display: inline;"> This paper describes a 
method to detect generic dynamic objects for automated driving. First, a LiDAR-based dynamic grid is generated online. Second, a deep learning-based detector is trained on the dynamic grid to infer the presence of dynamic objects of any type, which is a prerequisite for safe automated vehicles in arbitrary, edge-case scenarios. The Rotation-equivariant Detector (ReDet) - ori&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14799v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14799v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14799v1-abstract-full" style="display: none;"> This paper describes a method to detect generic dynamic objects for automated driving. First, a LiDAR-based dynamic grid is generated online. Second, a deep learning-based detector is trained on the dynamic grid to infer the presence of dynamic objects of any type, which is a prerequisite for safe automated vehicles in arbitrary, edge-case scenarios. The Rotation-equivariant Detector (ReDet) - originally designed for oriented object detection on aerial images - was chosen due to its high detection performance. Experiments are conducted based on real sensor data and the benefits in comparison to classic dynamic cell clustering strategies are highlighted. The false positive object detection rate is strongly reduced by the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14799v1-abstract-full').style.display = 'none'; document.getElementById('2410.14799v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, IEEE IV24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13052">arXiv:2410.13052</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13052">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Exploring Nanoscale Photoresponse Mechanisms for Enhanced Photothermoelectric Effects in van der Waals Interfaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Xu%2C+D">Da Xu</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Q">Qiushi Liu</a>, <a href="/search/?searchtype=author&amp;query=Liang%2C+B">Boqun Liang</a>, <a href="/search/?searchtype=author&amp;query=Yu%2C+N">Ning Yu</a>, <a href="/search/?searchtype=author&amp;query=Ma%2C+X">Xuezhi Ma</a>, <a href="/search/?searchtype=author&amp;query=Xu%2C+Y">Yaodong Xu</a>, <a href="/search/?searchtype=author&amp;query=Taniguchi%2C+T">Takashi Taniguchi</a>, <a href="/search/?searchtype=author&amp;query=Lake%2C+R+K">Roger K. Lake</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruoxue Yan</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+M">Ming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13052v1-abstract-short" style="display: inline;"> Integrated photodetectors are crucial for their high speed, sensitivity, and efficient power consumption. 
In these devices, photocurrent generation is primarily attributed to the photovoltaic (PV) effect, driven by electron hole separations, and the photothermoelectric (PTE) effect, which results from temperature gradients via the Seebeck effect. As devices shrink, the overlap of these mechanisms-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13052v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13052v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13052v1-abstract-full" style="display: none;"> Integrated photodetectors are crucial for their high speed, sensitivity, and efficient power consumption. In these devices, photocurrent generation is primarily attributed to the photovoltaic (PV) effect, driven by electron hole separations, and the photothermoelectric (PTE) effect, which results from temperature gradients via the Seebeck effect. As devices shrink, the overlap of these mechanisms-both dependent on the Fermi level and band structure-complicates their separate evaluation at the nanoscale. This study introduces a novel 3D photocurrent nano-imaging technique specifically designed to distinctly map these mechanisms in a Schottky barrier photodiode featuring a molybdenum disulfide and gold (MoS2 Au) interface. We uncover a significant PTE-dominated region extending several hundred nanometers from the electrode edge, a characteristic facilitated by the weak electrostatic forces typical in 2D materials. Unexpectedly, we find that incorporating hexagonal boron nitride (hBN), known for its high thermal conductivity, markedly enhances the PTE response. This counterintuitive enhancement stems from an optimal overlap between thermal and Seebeck profiles, presenting a new pathway to boost device performance. 
Our findings highlight the capability of this imaging technique to not only advance optoelectronic applications but also to deepen our understanding of light matter interactions within low-dimensional systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13052v1-abstract-full').style.display = 'none'; document.getElementById('2410.13052v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12850">arXiv:2410.12850</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12850">pdf</a>, <a href="https://arxiv.org/format/2410.12850">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RecurFormer: Not All Transformer Heads Need Self-Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruiqing Yan</a>, <a href="/search/?searchtype=author&amp;query=Zheng%2C+L">Linghan Zheng</a>, <a href="/search/?searchtype=author&amp;query=Du%2C+X">Xingbo Du</a>, <a href="/search/?searchtype=author&amp;query=Zou%2C+H">Han Zou</a>, <a href="/search/?searchtype=author&amp;query=Guo%2C+Y">Yufeng Guo</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+J">Jianfei Yang</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12850v1-abstract-short" style="display: inline;"> Transformer-based large language models (LLMs) excel in modeling complex language patterns but face significant computational costs during inference, especially with long inputs due to the attention mechanism&#39;s memory overhead. We observe that certain attention heads exhibit a distribution where the attention weights concentrate on tokens near the query token, termed as recency aware, which focuse&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12850v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12850v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12850v1-abstract-full" style="display: none;"> Transformer-based large language models (LLMs) excel in modeling complex language patterns but face significant computational costs during inference, especially with long inputs due to the attention mechanism&#39;s memory overhead. We observe that certain attention heads exhibit a distribution where the attention weights concentrate on tokens near the query token, termed as recency aware, which focuses on local and short-range dependencies. Leveraging this insight, we propose RecurFormer, a novel architecture that replaces these attention heads with linear recurrent neural networks (RNNs), specifically the Mamba architecture. This replacement reduces the cache size without evicting tokens, thus maintaining generation quality. RecurFormer retains the ability to model long-range dependencies through the remaining attention heads and allows for reusing pre-trained Transformer-based LLMs weights with continual training. 
Experiments demonstrate that RecurFormer matches the original model&#39;s performance while significantly enhancing inference efficiency. Our approach provides a practical solution to the computational challenges of Transformer-based LLMs inference, making it highly attractive for tasks involving long inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12850v1-abstract-full').style.display = 'none'; document.getElementById('2410.12850v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12460">arXiv:2410.12460</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12460">pdf</a>, <a href="https://arxiv.org/format/2410.12460">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Plasma Physics">physics.plasm-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="High Energy Physics - Phenomenology">hep-ph</span> </div> </div> <p class="title is-5 mathjax"> Efficient generation of divergent and collimated hot electrons via a novel multi-beam two-plasmon decay and stimulated Raman scattering mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Meng%2C+K+Y">K. Y. Meng</a>, <a href="/search/?searchtype=author&amp;query=Cai%2C+Z+H">Z. H. Cai</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+J">J. Li</a>, <a href="/search/?searchtype=author&amp;query=Yao%2C+C">C. Yao</a>, <a href="/search/?searchtype=author&amp;query=Hao%2C+L">L. 
Hao</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+F+X">F. X. Zhou</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">R. Yan</a>, <a href="/search/?searchtype=author&amp;query=Zheng%2C+J">J. Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12460v2-abstract-short" style="display: inline;"> In inertial confinement fusion (ICF) implosions, the preheating risks associated with hot electrons generated by laser plasma instabilities (LPI) are contingent upon the angular characteristics of these hot electrons for a given total energy. Using particle-in-cell simulations, we reveal a novel multi-beam collaborative mechanism of two-plasmon decay (TPD) and stimulated Raman scattering (SRS), an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12460v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12460v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12460v2-abstract-full" style="display: none;"> In inertial confinement fusion (ICF) implosions, the preheating risks associated with hot electrons generated by laser plasma instabilities (LPI) are contingent upon the angular characteristics of these hot electrons for a given total energy. Using particle-in-cell simulations, we reveal a novel multi-beam collaborative mechanism of two-plasmon decay (TPD) and stimulated Raman scattering (SRS), and investigate the angular variations of hot electrons generated from this shared TPD-SRS (STS) instability driven collectively by dual laser beams with varying incident angles $θ_{in}$ ($24^\circ$ to $55^\circ$ at the incident plane) for typical ICF conditions. 
In the simulations with $θ_{in}\gtrsim44^\circ$, STS emerges as the dominant mechanism responsible for hot electron generation, leading to a wide angular distribution of hot electrons that exhibit both pronounced divergent and collimated components. The common Langmuir wave associated with STS plays a crucial role in accelerating both components. By properly modeling the STS common wave gains, we establish scaling relations between these gains and the energies of collimated and divergent hot electrons. These relations reveal that the divergent hot electrons are more sensitive to variations in gain compared to the collimated electrons. Additionally, the calculated gains qualitatively predict the asymmetry in hot electron angular distributions when the density gradients deviate from the bisector of the laser beams. Our findings offer insights for hot electron generation with multiple beams, potentially complementing previous experiments that underscore the critical role of overlapped intensity from symmetric beams within the same cone and the dominance of dual-beam coupling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12460v2-abstract-full').style.display = 'none'; document.getElementById('2410.12460v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11647">arXiv:2410.11647</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11647">pdf</a>, <a href="https://arxiv.org/format/2410.11647">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Measuring Spiritual Values and Bias of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Liu%2C+S">Songyuan Liu</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+Z">Ziyang Zhang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Runze Yan</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+W">Wei Wu</a>, <a href="/search/?searchtype=author&amp;query=Yang%2C+C">Carl Yang</a>, <a href="/search/?searchtype=author&amp;query=Lu%2C+J">Jiaying Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11647v1-abstract-short" style="display: inline;"> Large language models (LLMs) have become an integral tool for users from various backgrounds. LLMs, trained on vast corpora, reflect the linguistic and cultural nuances embedded in their pre-training data. However, the values and perspectives inherent in this data can influence the behavior of LLMs, leading to potential biases. 
As a result, the use of LLMs in contexts involving spiritual or moral val&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11647v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11647v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11647v1-abstract-full" style="display: none;"> Large language models (LLMs) have become an integral tool for users from various backgrounds. LLMs, trained on vast corpora, reflect the linguistic and cultural nuances embedded in their pre-training data. However, the values and perspectives inherent in this data can influence the behavior of LLMs, leading to potential biases. As a result, the use of LLMs in contexts involving spiritual or moral values necessitates careful consideration of these underlying biases. Our work starts with verification of our hypothesis by testing the spiritual values of popular LLMs. Experimental results show that LLMs&#39; spiritual values are quite diverse, as opposed to the stereotype of atheists or secularists. We then investigate how different spiritual values affect LLMs in social-fairness scenarios (e.g., hate speech identification). Our findings reveal that different spiritual values indeed lead to different sensitivity to different hate target groups. Furthermore, we propose to continue pre-training LLMs on spiritual texts, and empirical results demonstrate the effectiveness of this approach in mitigating spiritual bias. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11647v1-abstract-full').style.display = 'none'; document.getElementById('2410.11647v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages including appendix; 5 figures; 5 tables; submitted to ARR - October 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05067">arXiv:2410.05067</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05067">pdf</a>, <a href="https://arxiv.org/format/2410.05067">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1051/0004-6361/202451339">10.1051/0004-6361/202451339 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Nebular Dust Attenuation with the Balmer and Paschen Lines based on the MaNGA Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Lin%2C+Z">Zesen Lin</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Renbin Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark 
mathjax" id="2410.05067v1-abstract-short" style="display: inline;"> Dust attenuations observed by stars and ionized gas are not necessarily the same. The lack of observational constraints on the nebular dust attenuation curve leaves a large uncertainty when correcting nebular dust attenuation with stellar continuum-based attenuation curves. Making use of the DAP catalogs of the MaNGA survey, we investigate the nebular dust attenuation of HII regions traced by the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05067v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05067v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05067v1-abstract-full" style="display: none;"> Dust attenuations observed by stars and ionized gas are not necessarily the same. The lack of observational constraints on the nebular dust attenuation curve leaves a large uncertainty when correcting nebular dust attenuation with stellar continuum-based attenuation curves. Making use of the DAP catalogs of the MaNGA survey, we investigate the nebular dust attenuation of HII regions traced by the Balmer and Paschen lines. Based on a simple simulation, we find that star-forming regions on kpc scales favor the classic foreground screen dust model rather than the uniform mixture model. We propose a novel approach to fit the dust attenuation curve using the emission-line fluxes directly. For strong hydrogen recombination lines (e.g., H$γ$, H$δ$, and H$ε$), the slopes of the nebular attenuation curve can be well determined and are found to be in good agreement with the Fitzpatrick Milky Way extinction curve with an accuracy of $\lesssim 4\%$ in terms of the correction factor. 
However, severe contaminations/systematic uncertainties prevent us from obtaining reasonable values of the slopes for weak recombination lines (e.g., the high-order Balmer lines or the Paschen lines). We discuss how the choice of emission line measurement methods affects the results. Our results demonstrate the difficulty of deriving an average nebular dust attenuation curve given the current ground-based emission-line measurements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05067v1-abstract-full').style.display = 'none'; document.getElementById('2410.05067v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in A&amp;A, 19 pages, 10 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> A&amp;A 691, A201 (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04498">arXiv:2410.04498</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04498">pdf</a>, <a href="https://arxiv.org/format/2410.04498">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AdaMemento: Adaptive Memory-Assisted Policy Optimization for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Renye Yan</a>, <a 
href="/search/?searchtype=author&amp;query=Gan%2C+Y">Yaozhong Gan</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+Y">You Wu</a>, <a href="/search/?searchtype=author&amp;query=Xing%2C+J">Junliang Xing</a>, <a href="/search/?searchtype=author&amp;query=Liangn%2C+L">Ling Liangn</a>, <a href="/search/?searchtype=author&amp;query=Zhu%2C+Y">Yeshang Zhu</a>, <a href="/search/?searchtype=author&amp;query=Cai%2C+Y">Yimao Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04498v1-abstract-short" style="display: inline;"> In sparse reward scenarios of reinforcement learning (RL), the memory mechanism provides promising shortcuts to policy optimization by reflecting on past experiences like humans. However, current memory-based RL methods simply store and reuse high-value policies, lacking a deeper refining and filtering of diverse past experiences and hence limiting the capability of memory. In this paper, we propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04498v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04498v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04498v1-abstract-full" style="display: none;"> In sparse reward scenarios of reinforcement learning (RL), the memory mechanism provides promising shortcuts to policy optimization by reflecting on past experiences like humans. However, current memory-based RL methods simply store and reuse high-value policies, lacking a deeper refining and filtering of diverse past experiences and hence limiting the capability of memory. In this paper, we propose AdaMemento, an adaptive memory-enhanced RL framework. 
Instead of just memorizing positive past experiences, we design a memory-reflection module that exploits both positive and negative experiences by learning to predict known local optimal policies based on real-time states. To effectively gather informative trajectories for the memory, we further introduce a fine-grained intrinsic motivation paradigm, where nuances in similar states can be precisely distinguished to guide exploration. The exploitation of past experiences and exploration of new policies are then adaptively coordinated by ensemble learning to approach the global optimum. Furthermore, we theoretically prove the superiority of our new intrinsic motivation and ensemble mechanism. From 59 quantitative and visualization experiments, we confirm that AdaMemento can distinguish subtle states for better exploration and effectively exploiting past experiences in memory, achieving significant improvement over previous methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04498v1-abstract-full').style.display = 'none'; document.getElementById('2410.04498v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04454">arXiv:2410.04454</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04454">pdf</a>, <a href="https://arxiv.org/format/2410.04454">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Inner-Probe: Discovering Copyright-related Data Generation in LLM Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Ma%2C+Q">Qichao Ma</a>, <a href="/search/?searchtype=author&amp;query=Zhu%2C+R">Rui-Jie Zhu</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+P">Peiye Liu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Renye Yan</a>, <a href="/search/?searchtype=author&amp;query=Zhang%2C+F">Fahong Zhang</a>, <a href="/search/?searchtype=author&amp;query=Liang%2C+L">Ling Liang</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+M">Meng Li</a>, <a href="/search/?searchtype=author&amp;query=Yu%2C+Z">Zhaofei Yu</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Z">Zongwei Wang</a>, <a href="/search/?searchtype=author&amp;query=Cai%2C+Y">Yimao Cai</a>, <a href="/search/?searchtype=author&amp;query=Huang%2C+T">Tiejun Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04454v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) utilize extensive knowledge databases and show powerful text generation ability. However, their reliance on high-quality copyrighted datasets raises concerns about copyright infringements in generated texts. 
Current research often employs prompt engineering or semantic classifiers to identify copyrighted content, but these approaches have two significant limitations: (&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04454v2-abstract-full').style.display = 'inline'; document.getElementById('2410.04454v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04454v2-abstract-full" style="display: none;"> Large Language Models (LLMs) utilize extensive knowledge databases and show powerful text generation ability. However, their reliance on high-quality copyrighted datasets raises concerns about copyright infringements in generated texts. Current research often employs prompt engineering or semantic classifiers to identify copyrighted content, but these approaches have two significant limitations: (1) Challenging to identify which specific sub-dataset (e.g., works from particular authors) influences an LLM&#39;s output. (2) Treating the entire training database as copyrighted, hence overlooking the inclusion of non-copyrighted training data. We propose InnerProbe, a lightweight framework designed to evaluate the influence of copyrighted sub-datasets on LLM-generated texts. Unlike traditional methods relying solely on text, we discover that the results of multi-head attention (MHA) during LLM output generation provide more effective information. Thus, InnerProbe performs sub-dataset contribution analysis using a lightweight LSTM-based network trained on MHA results in a supervised manner. Harnessing such a prior, InnerProbe enables non-copyrighted text detection through a concatenated global projector trained with unsupervised contrastive learning. 
InnerProbe demonstrates 3x improved efficiency compared to semantic model training in sub-dataset contribution analysis on Books3, achieves 15.04%-58.7% higher accuracy over baselines on the Pile, and delivers a 0.104 increase in AUC for non-copyrighted data filtering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04454v2-abstract-full').style.display = 'none'; document.getElementById('2410.04454v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19745">arXiv:2409.19745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19745">pdf</a>, <a href="https://arxiv.org/format/2409.19745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PEAR: Position-Embedding-Agnostic Attention Re-weighting Enhances Retrieval-Augmented Generation with Zero Inference Overhead </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Tan%2C+T">Tao Tan</a>, <a href="/search/?searchtype=author&amp;query=Qian%2C+Y">Yining Qian</a>, <a href="/search/?searchtype=author&amp;query=Lv%2C+A">Ang Lv</a>, <a href="/search/?searchtype=author&amp;query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+S">Songhao Wu</a>, <a 
href="/search/?searchtype=author&amp;query=Wang%2C+Y">Yongbo Wang</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+F">Feng Wang</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+J">Jingtong Wu</a>, <a href="/search/?searchtype=author&amp;query=Lu%2C+X">Xin Lu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19745v2-abstract-short" style="display: inline;"> Large language models (LLMs) enhanced with retrieval-augmented generation (RAG) have introduced a new paradigm for web search. However, the limited context awareness of LLMs degrades their performance on RAG tasks. Existing methods to enhance context awareness are often inefficient, incurring time or memory overhead during inference, and many are tailored to specific position embeddings. In this p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19745v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19745v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19745v2-abstract-full" style="display: none;"> Large language models (LLMs) enhanced with retrieval-augmented generation (RAG) have introduced a new paradigm for web search. However, the limited context awareness of LLMs degrades their performance on RAG tasks. Existing methods to enhance context awareness are often inefficient, incurring time or memory overhead during inference, and many are tailored to specific position embeddings. In this paper, we propose Position-Embedding-Agnostic attention Re-weighting (PEAR), which enhances the context awareness of LLMs with zero inference overhead. 
Specifically, on a proxy task focused on context copying, we first detect heads which suppress the models&#39; context awareness thereby diminishing RAG performance. To weaken the impact of these heads, we re-weight their outputs with learnable coefficients. The LLM (with frozen parameters) is optimized by adjusting these coefficients to minimize loss on the proxy task. As a result, the coefficients are optimized to values less than one, thereby reducing their tendency to suppress RAG performance. During inference, the optimized coefficients are fixed to re-weight these heads, regardless of the specific task at hand. Our proposed PEAR offers two major advantages over previous approaches: (1) It introduces zero additional inference overhead in terms of memory usage or inference time, while outperforming competitive baselines in accuracy and efficiency across various RAG tasks. (2) It is independent of position embedding algorithms, ensuring broader applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19745v2-abstract-full').style.display = 'none'; document.getElementById('2409.19745v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19700">arXiv:2409.19700</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19700">pdf</a>, <a href="https://arxiv.org/format/2409.19700">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> 2D-TPE: Two-Dimensional Positional Encoding Enhances Table Understanding for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Li%2C+J">Jia-Nan Li</a>, <a href="/search/?searchtype=author&amp;query=Guan%2C+J">Jian Guan</a>, <a href="/search/?searchtype=author&amp;query=Wu%2C+W">Wei Wu</a>, <a href="/search/?searchtype=author&amp;query=Yu%2C+Z">Zhengtao Yu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19700v3-abstract-short" style="display: inline;"> Tables are ubiquitous across various domains for concisely representing structured information. Empowering large language models (LLMs) to reason over tabular data represents an actively explored direction. 
However, since typical LLMs only support one-dimensional~(1D) inputs, existing methods often flatten the two-dimensional~(2D) table structure into a sequence of tokens, which can severely disru&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19700v3-abstract-full').style.display = 'inline'; document.getElementById('2409.19700v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19700v3-abstract-full" style="display: none;"> Tables are ubiquitous across various domains for concisely representing structured information. Empowering large language models (LLMs) to reason over tabular data represents an actively explored direction. However, since typical LLMs only support one-dimensional~(1D) inputs, existing methods often flatten the two-dimensional~(2D) table structure into a sequence of tokens, which can severely disrupt the spatial relationships and result in an inevitable loss of vital contextual information. In this paper, we first empirically demonstrate the detrimental impact of such flattening operations on the performance of LLMs in capturing the spatial information of tables through two elaborate proxy tasks. Subsequently, we introduce a simple yet effective positional encoding method, termed ``2D-TPE&#39;&#39; (Two-Dimensional Table Positional Encoding), to address this challenge. 2D-TPE enables each attention head to dynamically select a permutation order of tokens within the context for attending to them, where each permutation represents a distinct traversal mode for the table, such as column-wise or row-wise traversal. 2D-TPE effectively mitigates the risk of losing essential spatial information while preserving computational efficiency, thus better preserving the table structure. 
Extensive experiments across five benchmarks demonstrate that 2D-TPE outperforms strong baselines, underscoring the importance of preserving the table structure for accurate table comprehension. Comprehensive analysis further reveals the substantially better scalability of 2D-TPE to large tables than baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19700v3-abstract-full').style.display = 'none'; document.getElementById('2409.19700v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11664">arXiv:2409.11664</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11664">pdf</a>, <a href="https://arxiv.org/format/2409.11664">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681425">10.1145/3664647.3681425 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Agent Aggregator with Mask Denoise Mechanism for Histopathology Whole Slide Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/?searchtype=author&amp;query=Ling%2C+X">Xitong Ling</a>, <a href="/search/?searchtype=author&amp;query=Ouyang%2C+M">Minxi Ouyang</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Y">Yizhi Wang</a>, <a href="/search/?searchtype=author&amp;query=Chen%2C+X">Xinrui Chen</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Renao Yan</a>, <a href="/search/?searchtype=author&amp;query=Chu%2C+H">Hongbo Chu</a>, <a href="/search/?searchtype=author&amp;query=Cheng%2C+J">Junru Cheng</a>, <a href="/search/?searchtype=author&amp;query=Guan%2C+T">Tian Guan</a>, <a href="/search/?searchtype=author&amp;query=Tian%2C+S">Sufang Tian</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+X">Xiaoping Liu</a>, <a href="/search/?searchtype=author&amp;query=He%2C+Y">Yonghong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11664v1-abstract-short" style="display: inline;"> Histopathology analysis is the gold standard for medical diagnosis. Accurate classification of whole slide images (WSIs) and region-of-interests (ROIs) localization can assist pathologists in diagnosis. The gigapixel resolution of WSI and the absence of fine-grained annotations make direct classification and analysis challenging. In weakly supervised learning, multiple instance learning (MIL) pres&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11664v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11664v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11664v1-abstract-full" style="display: none;"> Histopathology analysis is the gold standard for medical diagnosis. Accurate classification of whole slide images (WSIs) and region-of-interests (ROIs) localization can assist pathologists in diagnosis. 
The gigapixel resolution of WSI and the absence of fine-grained annotations make direct classification and analysis challenging. In weakly supervised learning, multiple instance learning (MIL) presents a promising approach for WSI classification. The prevailing strategy is to use attention mechanisms to measure instance importance for classification. However, attention mechanisms fail to capture inter-instance information, and self-attention causes quadratic computational complexity. To address these challenges, we propose AMD-MIL, an agent aggregator with a mask denoise mechanism. The agent token acts as an intermediate variable between the query and key for computing instance importance. Mask and denoising matrices, mapped from agents-aggregated value, dynamically mask low-contribution representations and eliminate noise. AMD-MIL achieves better attention allocation by adjusting feature representations, capturing micro-metastases in cancer, and improving interpretability. Extensive experiments on CAMELYON-16, CAMELYON-17, TCGA-KIDNEY, and TCGA-LUNG show AMD-MIL&#39;s superiority over state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11664v1-abstract-full').style.display = 'none'; document.getElementById('2409.11664v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11340">arXiv:2409.11340</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11340">pdf</a>, <a href="https://arxiv.org/format/2409.11340">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OmniGen: Unified Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Xiao%2C+S">Shitao Xiao</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Y">Yueze Wang</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+J">Junjie Zhou</a>, <a href="/search/?searchtype=author&amp;query=Yuan%2C+H">Huaying Yuan</a>, <a href="/search/?searchtype=author&amp;query=Xing%2C+X">Xingrun Xing</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ruiran Yan</a>, <a href="/search/?searchtype=author&amp;query=Li%2C+C">Chaofan Li</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+S">Shuting Wang</a>, <a href="/search/?searchtype=author&amp;query=Huang%2C+T">Tiejun Huang</a>, <a href="/search/?searchtype=author&amp;query=Liu%2C+Z">Zheng Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11340v2-abstract-short" style="display: inline;"> The emergence of Large Language Models (LLMs) has unified language generation tasks and revolutionized human-machine interaction. However, in the realm of image generation, a unified model capable of handling various tasks within a single framework remains largely unexplored. 
In this work, we introduce OmniGen, a new diffusion model for unified image generation. OmniGen is characterized by the fol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11340v2-abstract-full').style.display = 'inline'; document.getElementById('2409.11340v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11340v2-abstract-full" style="display: none;"> The emergence of Large Language Models (LLMs) has unified language generation tasks and revolutionized human-machine interaction. However, in the realm of image generation, a unified model capable of handling various tasks within a single framework remains largely unexplored. In this work, we introduce OmniGen, a new diffusion model for unified image generation. OmniGen is characterized by the following features: 1) Unification: OmniGen not only demonstrates text-to-image generation capabilities but also inherently supports various downstream tasks, such as image editing, subject-driven generation, and visual-conditional generation. 2) Simplicity: The architecture of OmniGen is highly simplified, eliminating the need for additional plugins. Moreover, compared to existing diffusion models, it is more user-friendly and can complete complex tasks end-to-end through instructions without the need for extra intermediate steps, greatly simplifying the image generation workflow. 3) Knowledge Transfer: Benefit from learning in a unified format, OmniGen effectively transfers knowledge across different tasks, manages unseen tasks and domains, and exhibits novel capabilities. We also explore the model&#39;s reasoning capabilities and potential applications of the chain-of-thought mechanism. This work represents the first attempt at a general-purpose image generation model, and we will release our resources at https://github.com/VectorSpaceLab/OmniGen to foster future advancements. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11340v2-abstract-full').style.display = 'none'; document.getElementById('2409.11340v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Update the paper for OmniGen-v1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09281">arXiv:2409.09281</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.09281">pdf</a>, <a href="https://arxiv.org/format/2409.09281">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Language Models &#34;Grok&#34; to Copy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Lv%2C+A">Ang Lv</a>, <a href="/search/?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/?searchtype=author&amp;query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/?searchtype=author&amp;query=Kang%2C+Z">Zhanhui Kang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09281v2-abstract-short" style="display: inline;"> We examine the pre-training dynamics of language models, focusing on their ability to copy text from preceding context--a fundamental skill for various LLM applications, including in-context learning (ICL) and retrieval-augmented generation (RAG). We propose a novel perspective that Transformer-based language models develop copying abilities similarly to grokking, which refers to sudden generaliza&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09281v2-abstract-full').style.display = 'inline'; document.getElementById('2409.09281v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09281v2-abstract-full" style="display: none;"> We examine the pre-training dynamics of language models, focusing on their ability to copy text from preceding context--a fundamental skill for various LLM applications, including in-context learning (ICL) and retrieval-augmented generation (RAG). We propose a novel perspective that Transformer-based language models develop copying abilities similarly to grokking, which refers to sudden generalization on test set long after the model fit to the training set. Our experiments yield three arguments: (1) The pre-training loss decreases rapidly, while the context copying ability of models initially lags and then abruptly saturates. (2) The speed of developing copying ability is independent of the number of tokens trained, similarly to how grokking speed is unaffected by dataset size as long as the data distribution is preserved. (3) Induction heads, the attention heads responsible for copying, form from shallow to deep layers during training, mirroring the development of circuits in deeper layers during grokking. 
We contend that the connection between grokking and context copying can provide valuable insights for more effective language model training, ultimately improving in-context performance. For example, we demonstrated that techniques that enhance grokking, such as regularization, either accelerate or enhance the development of context copying. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09281v2-abstract-full').style.display = 'none'; document.getElementById('2409.09281v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2025 main conference, short paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07967">arXiv:2409.07967</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.07967">pdf</a>, <a href="https://arxiv.org/format/2409.07967">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Locality-aware Cross-modal Correspondence Learning for Dense Audio-Visual Events Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Xing%2C+L">Ling Xing</a>, <a href="/search/?searchtype=author&amp;query=Qu%2C+H">Hongyu Qu</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rui Yan</a>, <a 
href="/search/?searchtype=author&amp;query=Shu%2C+X">Xiangbo Shu</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07967v1-abstract-short" style="display: inline;"> Dense-localization Audio-Visual Events (DAVE) aims to identify time boundaries and corresponding categories for events that can be heard and seen concurrently in an untrimmed video. Existing methods typically encode audio and visual representation separately without any explicit cross-modal alignment constraint. Then they adopt dense cross-modal attention to integrate multimodal information for DA&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07967v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07967v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07967v1-abstract-full" style="display: none;"> Dense-localization Audio-Visual Events (DAVE) aims to identify time boundaries and corresponding categories for events that can be heard and seen concurrently in an untrimmed video. Existing methods typically encode audio and visual representation separately without any explicit cross-modal alignment constraint. Then they adopt dense cross-modal attention to integrate multimodal information for DAVE. Thus these methods inevitably aggregate irrelevant noise and events, especially in complex and long videos, leading to imprecise detection. In this paper, we present LOCO, a Locality-aware cross-modal Correspondence learning framework for DAVE. 
The core idea is to explore local temporal continuity nature of audio-visual events, which serves as informative yet free supervision signals to guide the filtering of irrelevant information and inspire the extraction of complementary multimodal information during both unimodal and cross-modal learning stages. i) Specifically, LOCO applies Locality-aware Correspondence Correction (LCC) to uni-modal features via leveraging cross-modal local-correlated properties without any extra annotations. This enforces uni-modal encoders to highlight similar semantics shared by audio and visual features. ii) To better aggregate such audio and visual features, we further customize Cross-modal Dynamic Perception layer (CDP) in cross-modal feature pyramid to understand local temporal patterns of audio-visual events by imposing local consistency within multimodal features in a data-driven manner. By incorporating LCC and CDP, LOCO provides solid performance gains and outperforms existing methods for DAVE. The source code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07967v1-abstract-full').style.display = 'none'; document.getElementById('2409.07967v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01143">arXiv:2409.01143</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.01143">pdf</a>, <a href="https://arxiv.org/format/2409.01143">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FlashFlex: Accommodating Large Language Model Training over Heterogeneous Environment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Ran Yan</a>, <a href="/search/?searchtype=author&amp;query=Jiang%2C+Y">Youhe Jiang</a>, <a href="/search/?searchtype=author&amp;query=Tao%2C+W">Wangcheng Tao</a>, <a href="/search/?searchtype=author&amp;query=Nie%2C+X">Xiaonan Nie</a>, <a href="/search/?searchtype=author&amp;query=Cui%2C+B">Bin Cui</a>, <a href="/search/?searchtype=author&amp;query=Yuan%2C+B">Binhang Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01143v1-abstract-short" style="display: inline;"> Training large language model (LLM) is a computationally intensive task, which is typically conducted in data centers with homogeneous high-performance GPUs. This paper explores an alternative approach by deploying the training computation across heterogeneous GPUs to enable better flexibility and efficiency for heterogeneous resource utilization. 
To achieve this goal, we propose a novel system, F&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01143v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01143v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01143v1-abstract-full" style="display: none;"> Training large language model (LLM) is a computationally intensive task, which is typically conducted in data centers with homogeneous high-performance GPUs. This paper explores an alternative approach by deploying the training computation across heterogeneous GPUs to enable better flexibility and efficiency for heterogeneous resource utilization. To achieve this goal, we propose a novel system, FlashFlex, that can flexibly support an asymmetric partition of the parallel training computations across the scope of data-, pipeline-, and tensor model parallelism. We further formalize the allocation of asymmetric partitioned training computations over a set of heterogeneous GPUs as a constrained optimization problem and propose an efficient solution based on a hierarchical graph partitioning algorithm. Our approach can adaptively allocate asymmetric training computations across GPUs, fully leveraging the available computational power. We conduct extensive empirical studies to evaluate the performance of FlashFlex, where we find that when training LLMs at different scales (from 7B to 30B), FlashFlex can achieve comparable training MFU when running over a set of heterogeneous GPUs compared with the state of the art training systems running over a set of homogeneous high-performance GPUs with the same amount of total peak FLOPS. The achieved smallest gaps in MFU are 11.61% and 0.30%, depending on whether the homogeneous setting is equipped with and without RDMA. Our implementation is available at https://github.com/Relaxed-System-Lab/FlashFlex. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01143v1-abstract-full').style.display = 'none'; document.getElementById('2409.01143v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14736">arXiv:2408.14736</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14736">pdf</a>, <a href="https://arxiv.org/format/2408.14736">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3673038.3673142">10.1145/3673038.3673142 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bandwidth-Aware and Overlap-Weighted Compression for Communication-Efficient Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&amp;query=Tang%2C+Z">Zichen Tang</a>, <a href="/search/?searchtype=author&amp;query=Huang%2C+J">Junlin Huang</a>, <a href="/search/?searchtype=author&amp;query=Yan%2C+R">Rudan Yan</a>, <a href="/search/?searchtype=author&amp;query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/?searchtype=author&amp;query=Tang%2C+Z">Zhenheng Tang</a>, <a 
href="/search/?searchtype=author&amp;query=Shi%2C+S">Shaohuai Shi</a>, <a href="/search/?searchtype=author&amp;query=Zhou%2C+A+C">Amelie Chi Zhou</a>, <a href="/search/?searchtype=author&amp;query=Chu%2C+X">Xiaowen Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14736v1-abstract-short" style="display: inline;"> Current data compression methods, such as sparsification in Federated Averaging (FedAvg), effectively enhance the communication efficiency of Federated Learning (FL). However, these methods encounter challenges such as the straggler problem and diminished model performance due to heterogeneous bandwidth and non-IID (Independently and Identically Distributed) data. To address these issues, we intro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14736v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14736v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14736v1-abstract-full" style="display: none;"> Current data compression methods, such as sparsification in Federated Averaging (FedAvg), effectively enhance the communication efficiency of Federated Learning (FL). However, these methods encounter challenges such as the straggler problem and diminished model performance due to heterogeneous bandwidth and non-IID (Independently and Identically Distributed) data. To address these issues, we introduce a bandwidth-aware compression framework for FL, aimed at improving communication efficiency while mitigating the problems associated with non-IID data. First, our strategy dynamically adjusts compression ratios according to bandwidth, enabling clients to upload their models at a close pace, thus exploiting the otherwise wasted time to transmit more data. 
Second, we identify the non-overlapped pattern of retained parameters after compression, which results in diminished client update signals due to uniformly averaged weights. Based on this finding, we propose a parameter mask to adjust the client-averaging coefficients at the parameter level, thereby more closely approximating the original updates, and improving the training convergence under heterogeneous environments. Our evaluations reveal that our method significantly boosts model accuracy, with a maximum improvement of 13% over the uncompressed FedAvg. Moreover, it achieves a $3.37\times$ speedup in reaching the target accuracy compared to FedAvg with a Top-K compressor, demonstrating its effectiveness in accelerating convergence with compression. The integration of common compression techniques into our framework further establishes its potential as a versatile foundation for future cross-device, communication-efficient FL research, addressing critical challenges in FL and advancing the field of distributed machine learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14736v1-abstract-full').style.display = 'none'; document.getElementById('2408.14736v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=50" class="pagination-next">Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=50" class="pagination-link" aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=100" class="pagination-link" aria-label="Page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=150" class="pagination-link" aria-label="Page 4">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+R&amp;start=200" class="pagination-link" aria-label="Page 5">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 
26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a 
class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10