Search | arXiv e-print repository
Showing 1–50 of 175 results for author: Dong, Q

Searching in archive cs; results can also be searched across all archives. Sorted by announcement date (newest first), 50 results per page (page 1 of 4).

1. arXiv:2503.22764 [pdf, other] cs.CL, cs.AI, cs.LG
Boosting Large Language Models with Mask Fine-Tuning
Authors: Mingyuan Zhang, Yue Bai, Huan Wang, Yizhou Wang, Qihua Dong, Yun Fu
Abstract: Mainstream large language model (LLM) fine-tuning protocols usually keep the model intact, and no prior work has questioned whether preserving the model's integrity is indispensable for performance. In this work, we introduce Mask Fine-Tuning (MFT), a new LLM fine-tuning paradigm showing that properly breaking the integrity of the model can, surprisingly, lead to improved performance. Specifically, MFT learns a set of binary masks supervised by the typical LLM fine-tuning objective. Extensive experiments show that MFT yields a consistent performance boost across various domains and backbones (e.g., a 1.95%/1.88% average gain in coding with LLaMA2-7B/3.1-8B). Detailed procedures are provided to study the proposed MFT from different hyperparameter perspectives for better insight. In particular, MFT naturally updates the current LLM training protocol by deploying it on a complete, well-trained model. This study extends the functionality of mask learning from its conventional network-pruning context for model compression to a more general scope.
Submitted 27 March, 2025; originally announced March 2025.
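A note on the entry above: the core mechanism, learning a binary mask over frozen pretrained weights with the ordinary fine-tuning loss, can be sketched in a few lines. The following is a minimal, hypothetical PyTorch illustration, not the authors' code; it assumes a straight-through estimator for the non-differentiable binarization, a standard trick in mask learning.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskedLinear(nn.Module):
    """Sketch of mask fine-tuning for one layer: the pretrained weight is
    frozen and only a binary mask over it is trained (illustrative only)."""

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.weight = nn.Parameter(linear.weight.detach(), requires_grad=False)
        self.bias = (nn.Parameter(linear.bias.detach(), requires_grad=False)
                     if linear.bias is not None else None)
        # Real-valued mask logits, initialized so the mask starts as all ones
        # (sigmoid(3.0) ~ 0.95 > 0.5), i.e. the model is initially intact.
        self.mask_logits = nn.Parameter(torch.full_like(self.weight, 3.0))

    def forward(self, x):
        soft = torch.sigmoid(self.mask_logits)
        hard = (soft > 0.5).float()
        # Straight-through estimator: the forward pass uses the hard 0/1 mask,
        # the backward pass sees the gradient of the soft mask.
        mask = hard + soft - soft.detach()
        return F.linear(x, self.weight * mask, self.bias)
```

Under this sketch, only mask_logits receive gradients from the usual task loss, so "breaking the integrity" of the model amounts to zeroing out weights whose logits fall below the threshold.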
2. arXiv:2503.19551 [pdf, other] cs.CL, cs.AI
Scaling Laws of Synthetic Data for Language Models
Authors: Zeyu Qin, Qingxiu Dong, Xingxing Zhang, Li Dong, Xiaolong Huang, Ziyi Yang, Mahmoud Khademi, Dongdong Zhang, Hany Hassan Awadalla, Yi R. Fung, Weizhu Chen, Minhao Cheng, Furu Wei
Abstract: Large language models (LLMs) achieve strong performance across diverse tasks, largely driven by high-quality web data used in pre-training. However, recent studies indicate this data source is rapidly depleting. Synthetic data emerges as a promising alternative, but it remains unclear whether synthetic datasets exhibit predictable scalability comparable to raw pre-training data. In this work, we systematically investigate the scaling laws of synthetic data by introducing SynthLLM, a scalable framework that transforms pre-training corpora into diverse, high-quality synthetic datasets. Our approach achieves this by automatically extracting and recombining high-level concepts across multiple documents using a graph algorithm. Key findings from our extensive mathematical experiments on SynthLLM include: (1) SynthLLM generates synthetic data that reliably adheres to the rectified scaling law across various model sizes; (2) performance improvements plateau near 300B tokens; and (3) larger models approach optimal performance with fewer training tokens; for instance, an 8B model peaks at 1T tokens, while a 3B model requires 4T. Moreover, comparisons with existing synthetic data generation and augmentation methods demonstrate that SynthLLM achieves superior performance and scalability. Our findings highlight synthetic data as a scalable and reliable alternative to organic pre-training corpora, offering a viable path toward continued improvement in model performance.
Submitted 26 March, 2025; v1 submitted 25 March, 2025; originally announced March 2025.
Comments: work in progress
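For readers unfamiliar with the term in the entry above: a "rectified" scaling law augments the classic power law in data size with a shift and an irreducible-loss term so that the curve plateaus, consistent with the ~300B-token plateau reported. The snippet below fits one such saturating form to made-up (token count, loss) pairs; both the functional form and the data are illustrative assumptions, not taken from the paper.

```python
import numpy as np
from scipy.optimize import curve_fit

def rectified_power_law(d, b, d0, beta, e):
    # L(D) = b / (d0 + D)^beta + e: loss vs. training tokens D, with a
    # shift d0 and an irreducible loss e so the curve flattens at large D.
    return b / (d0 + d) ** beta + e

# Hypothetical measurements (tokens in billions, validation loss).
tokens = np.array([10.0, 30.0, 100.0, 300.0, 1000.0])
loss = np.array([2.31, 2.10, 1.95, 1.88, 1.86])

params, _ = curve_fit(rectified_power_law, tokens, loss,
                      p0=[5.0, 10.0, 0.5, 1.5], maxfev=20000)
b, d0, beta, e = params
print(f"irreducible loss ~ {e:.2f}, exponent ~ {beta:.2f}")
```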
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08090v1-abstract-full').style.display = 'none'; document.getElementById('2503.08090v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07902">arXiv:2503.07902</a> <span> [<a href="https://arxiv.org/pdf/2503.07902">pdf</a>, <a href="https://arxiv.org/format/2503.07902">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LTLCodeGen: Code Generation of Syntactically Correct Temporal Logic for Robot Task Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rabiei%2C+B">Behrad Rabiei</a>, <a href="/search/cs?searchtype=author&query=R.%2C+M+K+A">Mahesh Kumar A. R.</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zhirui Dai</a>, <a href="/search/cs?searchtype=author&query=Pilla%2C+S+L+S+R">Surya L. S. R. Pilla</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiyue Dong</a>, <a href="/search/cs?searchtype=author&query=Atanasov%2C+N">Nikolay Atanasov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07902v1-abstract-short" style="display: inline;"> This paper focuses on planning robot navigation tasks from natural language specifications. We develop a modular approach, where a large language model (LLM) translates the natural language instructions into a linear temporal logic (LTL) formula with propositions defined by object classes in a semantic occupancy map. The LTL formula and the semantic occupancy map are provided to a motion planning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07902v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07902v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07902v1-abstract-full" style="display: none;"> This paper focuses on planning robot navigation tasks from natural language specifications. We develop a modular approach, where a large language model (LLM) translates the natural language instructions into a linear temporal logic (LTL) formula with propositions defined by object classes in a semantic occupancy map. The LTL formula and the semantic occupancy map are provided to a motion planning algorithm to generate a collision-free robot path that satisfies the natural language instructions. Our main contribution is LTLCodeGen, a method to translate natural language to syntactically correct LTL using code generation. We demonstrate the complete task planning method in real-world experiments involving human speech to provide navigation instructions to a mobile robot. 
We also thoroughly evaluate our approach in simulated and real-world experiments in comparison to end-to-end LLM task planning and state-of-the-art LLM-to-LTL translation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07902v1-abstract-full').style.display = 'none'; document.getElementById('2503.07902v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06471">arXiv:2503.06471</a> <span> [<a href="https://arxiv.org/pdf/2503.06471">pdf</a>, <a href="https://arxiv.org/format/2503.06471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Online Dense Point Tracking with Streaming Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiaole Dong</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06471v1-abstract-short" style="display: inline;"> Dense point tracking is a challenging task requiring the continuous tracking of every point in the initial frame throughout a substantial portion of a video, even in the presence of occlusions. Traditional methods use optical flow models to directly estimate long-range motion, but they often suffer from appearance drifting without considering temporal consistency. Recent point tracking algorithms… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06471v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06471v1-abstract-full" style="display: none;"> Dense point tracking is a challenging task requiring the continuous tracking of every point in the initial frame throughout a substantial portion of a video, even in the presence of occlusions. Traditional methods use optical flow models to directly estimate long-range motion, but they often suffer from appearance drifting without considering temporal consistency. Recent point tracking algorithms usually depend on sliding windows for indirect information propagation from the first frame to the current one, which is slow and less effective for long-range tracking. To account for temporal consistency and enable efficient information propagation, we present a lightweight and fast model with \textbf{S}treaming memory for dense \textbf{PO}int \textbf{T}racking and online video processing. The \textbf{SPOT} framework features three core components: a customized memory reading module for feature enhancement, a sensory memory for short-term motion dynamics modeling, and a visibility-guided splatting module for accurate information propagation. 
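The key claim in the entry above, syntactically correct LTL, implies a generate-and-validate loop: the LLM proposes a formula and a parser accepts or rejects it. Here is a self-contained toy version with a deliberately tiny LTL grammar; the grammar, the prompt, and the `llm` callable are all assumptions for illustration, not the paper's actual code-generation pipeline.

```python
import re

def tokenize(s):
    # Atoms are lowercase identifiers; any other non-space char is a token.
    return re.findall(r"->|&&|\|\||[FGXU!()]|[a-z_][a-z0-9_]*|\S", s)

def parse(toks, i=0):
    """Return the index just past one formula, or raise ValueError.
    Toy grammar (binary operators must be parenthesized):
      phi ::= atom | ('!'|'F'|'G'|'X') phi
            | '(' phi ('U'|'&&'|'||'|'->') phi ')'
    """
    if i >= len(toks):
        raise ValueError("unexpected end of formula")
    t = toks[i]
    if re.fullmatch(r"[a-z_][a-z0-9_]*", t):
        return i + 1
    if t in ("!", "F", "G", "X"):
        return parse(toks, i + 1)
    if t == "(":
        j = parse(toks, i + 1)
        if j >= len(toks) or toks[j] not in ("U", "&&", "||", "->"):
            raise ValueError("expected a binary operator")
        k = parse(toks, j + 1)
        if k >= len(toks) or toks[k] != ")":
            raise ValueError("expected ')'")
        return k + 1
    raise ValueError(f"unexpected token {t!r}")

def is_wellformed(formula: str) -> bool:
    toks = tokenize(formula)
    try:
        return parse(toks) == len(toks)
    except ValueError:
        return False

def nl_to_ltl(instruction, llm, max_tries=3):
    """Generate-and-validate loop; `llm` is any callable prompt -> str
    (a hypothetical stand-in for an LLM API wrapper you supply)."""
    prompt = f"Translate to LTL over the map's propositions: {instruction}"
    for _ in range(max_tries):
        candidate = llm(prompt).strip()
        if is_wellformed(candidate):
            return candidate
        prompt += f"\nOutput {candidate!r} was not well-formed LTL; try again."
    raise RuntimeError("no syntactically valid formula produced")

assert is_wellformed("(F kitchen U G ! obstacle)")
assert not is_wellformed("F F (kitchen U")
```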
5. arXiv:2503.06471 [pdf, other] cs.CV
Online Dense Point Tracking with Streaming Memory
Authors: Qiaole Dong, Yanwei Fu
Abstract: Dense point tracking is a challenging task requiring the continuous tracking of every point in the initial frame throughout a substantial portion of a video, even in the presence of occlusions. Traditional methods use optical flow models to directly estimate long-range motion, but they often suffer from appearance drifting without considering temporal consistency. Recent point tracking algorithms usually depend on sliding windows for indirect information propagation from the first frame to the current one, which is slow and less effective for long-range tracking. To account for temporal consistency and enable efficient information propagation, we present SPOT, a lightweight and fast model with Streaming memory for dense POint Tracking and online video processing. The SPOT framework features three core components: a customized memory reading module for feature enhancement, a sensory memory for short-term motion dynamics modeling, and a visibility-guided splatting module for accurate information propagation. This combination enables SPOT to perform dense point tracking with state-of-the-art accuracy on the CVO benchmark, as well as comparable or superior performance to offline models on sparse tracking benchmarks such as TAP-Vid and RoboTAP. Notably, SPOT has roughly 10× fewer parameters yet runs at least 2× faster than previous state-of-the-art models while maintaining the best performance on CVO. We will release the models and code at https://github.com/DQiaole/SPOT.
Submitted 9 March, 2025; originally announced March 2025.

6. arXiv:2503.02682 [pdf, other] cs.CL, cs.AI, cs.LG
MPO: Boosting LLM Agents with Meta Plan Optimization
Authors: Weimin Xiong, Yifan Song, Qingxiu Dong, Bingchan Zhao, Feifan Song, Xun Wang, Sujian Li
Abstract: Recent advancements in large language models (LLMs) have enabled LLM-based agents to successfully tackle interactive planning tasks. However, despite their successes, existing approaches often suffer from planning hallucinations and require retraining for each new agent. To address these challenges, we propose the Meta Plan Optimization (MPO) framework, which enhances agent planning capabilities by directly incorporating explicit guidance. Unlike previous methods that rely on complex knowledge, which either requires significant human effort or lacks quality assurance, MPO leverages high-level general guidance through meta plans to assist agent planning, and enables continuous optimization of the meta plans based on feedback from the agent's task execution. Our experiments on two representative tasks demonstrate that MPO significantly outperforms existing baselines. Moreover, our analysis indicates that MPO provides a plug-and-play solution that enhances both task completion efficiency and generalization capabilities in previously unseen scenarios.
Submitted 4 March, 2025; originally announced March 2025.
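Read as a control loop, the MPO abstract above suggests roughly the following shape: a meta plan guides the agent, and execution feedback drives plan refinement. The sketch below is a guess at that loop's structure; `planner`, `agent`, and `refine` are hypothetical callables (e.g., LLM wrappers), and the paper's actual optimization procedure is not specified here.

```python
def run_with_meta_plan(task, agent, planner, refine, episodes=5):
    """Hedged sketch of a meta-plan loop: high-level guidance steers the
    agent, and execution feedback is used to improve the guidance."""
    meta_plan = planner(task)                       # initial high-level plan
    best_plan, best_score = meta_plan, float("-inf")
    for _ in range(episodes):
        trajectory, score = agent(task, meta_plan)  # act under guidance
        if score > best_score:
            best_plan, best_score = meta_plan, score
        # Use the observed trajectory and score to propose a better plan.
        meta_plan = refine(task, meta_plan, trajectory, score)
    return best_plan
```

Because the guidance lives outside the agent's weights, a loop like this needs no retraining per agent, which matches the plug-and-play claim.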
However, the lack of high-qu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00501v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00501v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00501v1-abstract-full" style="display: none;"> User-generated content (UGC) communities, especially those featuring multimodal content, improve user experiences by integrating visual and textual information into results (or items). The challenge of improving user experiences in complex systems with search and recommendation (S\&R) services has drawn significant attention from both academia and industry these years. However, the lack of high-quality datasets has limited the research progress on multimodal S\&R. To address the growing need for developing better S\&R services, we present a novel multimodal information retrieval dataset in this paper, namely Qilin. The dataset is collected from Xiaohongshu, a popular social platform with over 300 million monthly active users and an average search penetration rate of over 70\%. In contrast to existing datasets, \textsf{Qilin} offers a comprehensive collection of user sessions with heterogeneous results like image-text notes, video notes, commercial notes, and direct answers, facilitating the development of advanced multimodal neural retrieval models across diverse task settings. To better model user satisfaction and support the analysis of heterogeneous user behaviors, we also collect extensive APP-level contextual signals and genuine user feedback. Notably, Qilin contains user-favored answers and their referred results for search requests triggering the Deep Query Answering (DQA) module. This allows not only the training \& evaluation of a Retrieval-augmented Generation (RAG) pipeline, but also the exploration of how such a module would affect users' search behavior. Through comprehensive analysis and experiments, we provide interesting findings and insights for further improving S\&R systems. We hope that \textsf{Qilin} will significantly contribute to the advancement of multimodal content platforms with S\&R services in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00501v1-abstract-full').style.display = 'none'; document.getElementById('2503.00501v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20707">arXiv:2502.20707</a> <span> [<a href="https://arxiv.org/pdf/2502.20707">pdf</a>, <a href="https://arxiv.org/format/2502.20707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIM.2025.3547488">10.1109/TIM.2025.3547488 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> FSMP: A Frontier-Sampling-Mixed Planner for Fast Autonomous Exploration of Complex and Large 3-D Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiyong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuebo Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qianli Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyu Wang</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+H">Haobo Xi</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jing Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20707v1-abstract-short" style="display: inline;"> In this paper, we propose a systematic framework for fast exploration of complex and large 3-D environments using micro aerial vehicles (MAVs). The key insight is the organic integration of the frontier-based and sampling-based strategies that can achieve rapid global exploration of the environment. Specifically, a field-of-view-based (FOV) frontier detector with the guarantee of completeness and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20707v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20707v1-abstract-full" style="display: none;"> In this paper, we propose a systematic framework for fast exploration of complex and large 3-D environments using micro aerial vehicles (MAVs). The key insight is the organic integration of the frontier-based and sampling-based strategies that can achieve rapid global exploration of the environment. Specifically, a field-of-view-based (FOV) frontier detector with the guarantee of completeness and soundness is devised for identifying 3-D map frontiers. Different from random sampling-based methods, the deterministic sampling technique is employed to build and maintain an incremental road map based on the recorded sensor FOVs and newly detected frontiers. With the resulting road map, we propose a two-stage path planner. First, it quickly computes the global optimal exploration path on the road map using the lazy evaluation strategy. Then, the best exploration path is smoothed for further improving the exploration efficiency. 
We validate the proposed method both in simulation and real-world experiments. The comparative results demonstrate the promising performance of our planner in terms of exploration efficiency, computational time, and explored volume. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20707v1-abstract-full').style.display = 'none'; document.getElementById('2502.20707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13pages, 12 figures, accepted by IEEE Transactions on Instrumentation and Measurement</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14642">arXiv:2502.14642</a> <span> [<a href="https://arxiv.org/pdf/2502.14642">pdf</a>, <a href="https://arxiv.org/format/2502.14642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> How Far are LLMs from Being Our Digital Twins? A Benchmark for Persona-Based Behavior Chain Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xinfeng Yuan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+L">Lei Sha</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhifang Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14642v1-abstract-short" style="display: inline;"> Recently, LLMs have garnered increasing attention across academic disciplines for their potential as human digital twins, virtual proxies designed to replicate individuals and autonomously perform tasks such as decision-making, problem-solving, and reasoning on their behalf. However, current evaluations of LLMs primarily emphasize dialogue simulation while overlooking human behavior simulation, wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14642v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14642v1-abstract-full" style="display: none;"> Recently, LLMs have garnered increasing attention across academic disciplines for their potential as human digital twins, virtual proxies designed to replicate individuals and autonomously perform tasks such as decision-making, problem-solving, and reasoning on their behalf. However, current evaluations of LLMs primarily emphasize dialogue simulation while overlooking human behavior simulation, which is crucial for digital twins. 
To address this gap, we introduce BehaviorChain, the first benchmark for evaluating LLMs' ability to simulate continuous human behavior. BehaviorChain comprises diverse, high-quality, persona-based behavior chains, totaling 15,846 distinct behaviors across 1,001 unique personas, each with detailed history and profile metadata. For evaluation, we integrate persona metadata into LLMs and employ them to iteratively infer contextually appropriate behaviors within dynamic scenarios provided by BehaviorChain. Comprehensive evaluation results demonstrated that even state-of-the-art models struggle with accurately simulating continuous human behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14642v1-abstract-full').style.display = 'none'; document.getElementById('2502.14642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13925">arXiv:2502.13925</a> <span> [<a href="https://arxiv.org/pdf/2502.13925">pdf</a>, <a href="https://arxiv.org/format/2502.13925">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Single Frames: Can LMMs Comprehend Temporal and Contextual Narratives in Image Sequences? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaochen Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jialin Song</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+L">Longyu Guan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yixin Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weiyao Luo</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yifan Pu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiru Wang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+X">Xiangdi Meng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhifang Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13925v1-abstract-short" style="display: inline;"> Large Multimodal Models (LMMs) have achieved remarkable success across various visual-language tasks. However, existing benchmarks predominantly focus on single-image understanding, leaving the analysis of image sequences largely unexplored. 
To address this limitation, we introduce StripCipher, a comprehensive benchmark designed to evaluate capabilities of LMMs to comprehend and reason over sequen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13925v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13925v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13925v1-abstract-full" style="display: none;"> Large Multimodal Models (LMMs) have achieved remarkable success across various visual-language tasks. However, existing benchmarks predominantly focus on single-image understanding, leaving the analysis of image sequences largely unexplored. To address this limitation, we introduce StripCipher, a comprehensive benchmark designed to evaluate capabilities of LMMs to comprehend and reason over sequential images. StripCipher comprises a human-annotated dataset and three challenging subtasks: visual narrative comprehension, contextual frame prediction, and temporal narrative reordering. Our evaluation of $16$ state-of-the-art LMMs, including GPT-4o and Qwen2.5VL, reveals a significant performance gap compared to human capabilities, particularly in tasks that require reordering shuffled sequential images. For instance, GPT-4o achieves only 23.93% accuracy in the reordering subtask, which is 56.07% lower than human performance. Further quantitative analysis discuss several factors, such as input format of images, affecting the performance of LLMs in sequential understanding, underscoring the fundamental challenges that remain in the development of LMMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13925v1-abstract-full').style.display = 'none'; document.getElementById('2502.13925v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08958">arXiv:2502.08958</a> <span> [<a href="https://arxiv.org/pdf/2502.08958">pdf</a>, <a href="https://arxiv.org/format/2502.08958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Biologically Plausible Brain Graph Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+C">Ciyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuelong Huang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qichao Dong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shuo Yu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Feng Xia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yaochu Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08958v1-abstract-short" style="display: inline;"> State-of-the-art brain graph analysis methods fail to fully encode the small-world architecture of brain graphs (accompanied by the presence of hubs and functional modules), and therefore lack biological plausibility to some extent. This limitation hinders their ability to accurately represent the brain's structural and functional properties, thereby restricting the effectiveness of machine learni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08958v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08958v1-abstract-full" style="display: none;"> State-of-the-art brain graph analysis methods fail to fully encode the small-world architecture of brain graphs (accompanied by the presence of hubs and functional modules), and therefore lack biological plausibility to some extent. This limitation hinders their ability to accurately represent the brain's structural and functional properties, thereby restricting the effectiveness of machine learning models in tasks such as brain disorder detection. In this work, we propose a novel Biologically Plausible Brain Graph Transformer (BioBGT) that encodes the small-world architecture inherent in brain graphs. Specifically, we present a network entanglement-based node importance encoding technique that captures the structural importance of nodes in global information propagation during brain graph communication, highlighting the biological properties of the brain structure. Furthermore, we introduce a functional module-aware self-attention to preserve the functional segregation and integration characteristics of brain graphs in the learned representations. 
Experimental results on three benchmark datasets demonstrate that BioBGT outperforms state-of-the-art models, enhancing biologically plausible brain graph representations for various brain graph analytical tasks <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08958v1-abstract-full').style.display = 'none'; document.getElementById('2502.08958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27pages, 16figures, published as a conference paper at ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10775">arXiv:2501.10775</a> <span> [<a href="https://arxiv.org/pdf/2501.10775">pdf</a>, <a href="https://arxiv.org/format/2501.10775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MedFILIP: Medical Fine-grained Language-Image Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xinjie Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangyu Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fanding Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qing Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kuanquan Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+S">Suyu Dong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+G">Gongning Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10775v1-abstract-short" style="display: inline;"> Medical vision-language pretraining (VLP) that leverages naturally-paired medical image-report data is crucial for medical image analysis. However, existing methods struggle to accurately characterize associations between images and diseases, leading to inaccurate or incomplete diagnostic results. In this work, we propose MedFILIP, a fine-grained VLP model, introduces medical image-specific knowle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10775v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10775v1-abstract-full" style="display: none;"> Medical vision-language pretraining (VLP) that leverages naturally-paired medical image-report data is crucial for medical image analysis. 
However, existing methods struggle to accurately characterize associations between images and diseases, leading to inaccurate or incomplete diagnostic results. In this work, we propose MedFILIP, a fine-grained VLP model that introduces medical image-specific knowledge through contrastive learning. Specifically: 1) An information extractor based on a large language model is proposed to decouple comprehensive disease details from reports, which excels at extracting disease details through flexible prompt engineering, thereby effectively reducing text complexity while retaining rich information at minimal cost. 2) A knowledge injector is proposed to construct relationships between categories and visual attributes, which helps the model make judgments based on image features and fosters knowledge extrapolation to unfamiliar disease categories. 3) A semantic similarity matrix based on fine-grained annotations is proposed, providing smoother, more informative labels, thus allowing fine-grained image-text alignment. 4) We validate MedFILIP on numerous datasets, e.g., RSNA-Pneumonia, NIH ChestX-ray14, VinBigData, and COVID-19. For single-label, multi-label, and fine-grained classification, our model achieves state-of-the-art performance, improving classification accuracy by up to 6.69\%. The code is available at https://github.com/PerceptionComputingLab/MedFILIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10775v1-abstract-full').style.display = 'none'; document.getElementById('2501.10775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures, IEEE Journal of Biomedical and Health Informatics 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07825">arXiv:2501.07825</a> <span> [<a href="https://arxiv.org/pdf/2501.07825">pdf</a>, <a href="https://arxiv.org/format/2501.07825">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> An Efficient Sparse Hardware Accelerator for Spike-Driven Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengke Li</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+W">Wendong Mao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiwei Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07825v1-abstract-short" style="display: inline;"> Recently, large models, such as Vision Transformer and BERT, have garnered significant attention due to their exceptional performance. However, their extensive computational requirements lead to considerable power and hardware resource consumption.
Brain-inspired computing, characterized by its spike-driven methods, has emerged as a promising approach for low-power hardware implementation. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07825v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07825v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07825v1-abstract-full" style="display: none;"> Recently, large models, such as Vision Transformer and BERT, have garnered significant attention due to their exceptional performance. However, their extensive computational requirements lead to considerable power and hardware resource consumption. Brain-inspired computing, characterized by its spike-driven methods, has emerged as a promising approach for low-power hardware implementation. In this paper, we propose an efficient sparse hardware accelerator for Spike-driven Transformer. We first design a novel encoding method that encodes the position information of valid activations and skips non-spike values. This method enables us to use encoded spikes for executing the calculations of linear layers, max-pooling, and spike-driven self-attention. Compared with the single spike input design of conventional SNN accelerators that primarily focus on convolution-based spiking computations, the specialized module for spike-driven self-attention is unique in its ability to handle dual spike inputs. By exclusively utilizing activated spikes, our design fully exploits the sparsity of Spike-driven Transformer, which diminishes redundant operations, lowers power consumption, and minimizes computational latency. Experimental results indicate that compared to existing SNN accelerators, our design achieves up to 13.24$\times$ and 1.33$\times$ improvements in terms of throughput and energy efficiency, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07825v1-abstract-full').style.display = 'none'; document.getElementById('2501.07825v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
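<p class="is-size-7">A software analogue of the position-encoding idea from this abstract: store only the indices of active binary spikes so that a linear layer reduces to summing the selected weight rows. This sketches the general principle, not the accelerator's hardware design; all names are illustrative.</p> <pre><code>
import numpy as np

def encode_spikes(spike_vec):
    """Return positions of 1-valued spikes; zero (non-spike) values are skipped."""
    return np.flatnonzero(spike_vec)

def sparse_linear(positions, W, b):
    """Equivalent to spike_vec @ W + b, touching only the active rows of W."""
    return W[positions].sum(axis=0) + b

rng = np.random.default_rng(1)
spikes = (rng.random(16) > 0.8).astype(np.int8)      # sparse binary input
W, b = rng.normal(size=(16, 4)), np.zeros(4)
pos = encode_spikes(spikes)
assert np.allclose(sparse_linear(pos, W, b), spikes @ W + b)
</code></pre>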
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18619">arXiv:2412.18619</a> <span> [<a href="https://arxiv.org/pdf/2412.18619">pdf</a>, <a href="https://arxiv.org/format/2412.18619">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Next Token Prediction Towards Multimodal Intelligence: A Comprehensive Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+S">Shuhuai Ren</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haozhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunshui Li</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zefan Cai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongcheng Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yizhe Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Ruoyu Wu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yulong Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+S">Shuai Bai</a>, <a href="/search/cs?searchtype=author&query=Vlachos%2C+A">Andreas Vlachos</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wen Xiao</a>, <a href="/search/cs?searchtype=author&query=Yee%2C+A">Aaron Yee</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18619v2-abstract-short" style="display: inline;"> Building on the foundations of language modeling in natural language processing, Next Token Prediction (NTP) has evolved into a versatile training objective for machine learning tasks across various modalities, achieving considerable success. 
As Large Language Models (LLMs) have advanced to unify understanding and generation tasks within the textual modality, recent research has shown that tasks f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18619v2-abstract-full').style.display = 'inline'; document.getElementById('2412.18619v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18619v2-abstract-full" style="display: none;"> Building on the foundations of language modeling in natural language processing, Next Token Prediction (NTP) has evolved into a versatile training objective for machine learning tasks across various modalities, achieving considerable success. As Large Language Models (LLMs) have advanced to unify understanding and generation tasks within the textual modality, recent research has shown that tasks from different modalities can also be effectively encapsulated within the NTP framework, transforming multimodal information into tokens and predicting the next one given the context. This survey introduces a comprehensive taxonomy that unifies both understanding and generation within multimodal learning through the lens of NTP. The proposed taxonomy covers five key aspects: Multimodal tokenization, MMNTP model architectures, unified task representation, datasets \& evaluation, and open challenges. This new taxonomy aims to aid researchers in their exploration of multimodal intelligence. An associated GitHub repository collecting the latest papers and repos is available at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18619v2-abstract-full').style.display = 'none'; document.getElementById('2412.18619v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">69 papes, 18 figures, repo at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16895">arXiv:2412.16895</a> <span> [<a href="https://arxiv.org/pdf/2412.16895">pdf</a>, <a href="https://arxiv.org/format/2412.16895">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Dataset Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Muquan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dongyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiang Dong</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xiurui Xie</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+K">Ke Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16895v1-abstract-short" style="display: inline;"> Contemporary deep learning, characterized by the training of cumbersome neural networks on massive datasets, confronts substantial computational hurdles. To alleviate heavy data storage burdens on limited hardware resources, numerous dataset compression methods such as dataset distillation (DD) and coreset selection have emerged to obtain a compact but informative dataset through synthesis or sele… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16895v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16895v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16895v1-abstract-full" style="display: none;"> Contemporary deep learning, characterized by the training of cumbersome neural networks on massive datasets, confronts substantial computational hurdles. To alleviate heavy data storage burdens on limited hardware resources, numerous dataset compression methods such as dataset distillation (DD) and coreset selection have emerged to obtain a compact but informative dataset through synthesis or selection for efficient training. However, DD involves an expensive optimization procedure and exhibits limited generalization across unseen architectures, while coreset selection is limited by its low data keep ratio and reliance on heuristics, hindering its practicality and feasibility. To address these limitations, we introduce a newly versatile framework for dataset compression, namely Adaptive Dataset Quantization (ADQ). Specifically, we first identify the sub-optimal performance of naive Dataset Quantization (DQ), which relies on uniform sampling and overlooks the varying importance of each generated bin. Subsequently, we propose a novel adaptive sampling strategy through the evaluation of generated bins' representativeness score, diversity score and importance score, where the former two scores are quantified by the texture level and contrastive learning-based techniques, respectively. 
Extensive experiments demonstrate that our method not only exhibits superior generalization capability across different architectures, but also attains state-of-the-art results, surpassing DQ by an average of 3\% on various datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16895v1-abstract-full').style.display = 'none'; document.getElementById('2412.16895v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05579">arXiv:2412.05579</a> <span> [<a href="https://arxiv.org/pdf/2412.05579">pdf</a>, <a href="https://arxiv.org/format/2412.05579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haitao Li</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qian Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junjie Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Huixue Su</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yujia Zhou</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+Q">Qingyao Ai</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Ziyi Ye</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05579v2-abstract-short" style="display: inline;"> The rapid advancement of Large Language Models (LLMs) has driven their expanding application across various fields. One of the most promising applications is their role as evaluators based on natural language responses, referred to as ''LLMs-as-judges''. This framework has attracted growing attention from both academia and industry due to its effectiveness, ability to generalize across… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05579v2-abstract-full').style.display = 'inline'; document.getElementById('2412.05579v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05579v2-abstract-full" style="display: none;"> The rapid advancement of Large Language Models (LLMs) has driven their expanding application across various fields. One of the most promising applications is their role as evaluators based on natural language responses, referred to as ''LLMs-as-judges''. This framework has attracted growing attention from both academia and industry due to its effectiveness, ability to generalize across tasks, and interpretability in the form of natural language.
This paper presents a comprehensive survey of the LLMs-as-judges paradigm from five key perspectives: Functionality, Methodology, Applications, Meta-evaluation, and Limitations. We begin by providing a systematic definition of LLMs-as-Judges and introduce their functionality (Why use LLM judges?). Then we address the methodology for constructing an evaluation system with LLMs (How to use LLM judges?). Additionally, we investigate the potential domains for their application (Where to use LLM judges?) and discuss methods for evaluating them in various contexts (How to evaluate LLM judges?). Finally, we provide a detailed analysis of the limitations of LLM judges and discuss potential future directions. Through a structured and comprehensive analysis, we aim to provide insights on the development and application of LLMs-as-judges in both research and practice. We will continue to maintain the relevant resource list at https://github.com/CSHaitao/Awesome-LLMs-as-Judges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05579v2-abstract-full').style.display = 'none'; document.getElementById('2412.05579v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 pages, comprehensive and continuously updated</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17745">arXiv:2411.17745</a> <span> [<a href="https://arxiv.org/pdf/2411.17745">pdf</a>, <a href="https://arxiv.org/format/2411.17745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Parameter Adaptive Trajectory Tracking and Motion Control Framework for Autonomous Vehicle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiarui Song</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yingbo Sun</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qing Dong</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xuewu Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17745v1-abstract-short" style="display: inline;"> This paper studies the trajectory tracking and motion control problems for autonomous vehicles (AVs). A parameter adaptive control framework for AVs is proposed to enhance tracking accuracy and yaw stability.
By establishing a linear quadratic regulator (LQR) and three robust controllers, the control framework addresses trajectory tracking and motion control in a modular fashion, without introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17745v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17745v1-abstract-full" style="display: none;"> This paper studies the trajectory tracking and motion control problems for autonomous vehicles (AVs). A parameter adaptive control framework for AVs is proposed to enhance tracking accuracy and yaw stability. By establishing a linear quadratic regulator (LQR) and three robust controllers, the control framework addresses trajectory tracking and motion control in a modular fashion, without introducing complexity into each controller. Robust performance is guaranteed in the three robust controllers by comprehensively considering parameter uncertainties, mismatch of unmodeled subsystems, and external disturbances. Also, the dynamic characteristics of uncertain parameters are identified by the Recursive Least Squares (RLS) algorithm, while the boundaries of three robust factors are determined by combining Gaussian Process Regression (GPR) and Bayesian optimization machine learning methods, reducing the conservatism of the controller. Sufficient conditions for closed-loop stability under the diverse robust factors are provided analytically by the Lyapunov method. Simulation results on the MATLAB/Simulink and Carsim joint platform demonstrate that the proposed methodology considerably improves tracking accuracy, driving stability, and robust performance, guaranteeing the feasibility and capability of driving in extreme scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17745v1-abstract-full').style.display = 'none'; document.getElementById('2411.17745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
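<p class="is-size-7">For reference, the baseline LQR gain in such a framework follows from the discrete algebraic Riccati equation; the toy system matrices below are illustrative and are not the paper's vehicle model.</p> <pre><code>
import numpy as np
from scipy.linalg import solve_discrete_are

# Toy discrete-time double integrator standing in for lateral dynamics.
A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.0], [0.1]])
Q = np.diag([10.0, 1.0])          # penalize tracking error and its rate
R = np.array([[1.0]])             # penalize steering effort

P = solve_discrete_are(A, B, Q, R)
K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)   # feedback law u = -K x
print(K)
</code></pre>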
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05088">arXiv:2411.05088</a> <span> [<a href="https://arxiv.org/pdf/2411.05088">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Findings of the IWSLT 2024 Evaluation Campaign </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahmad%2C+I+S">Ibrahim Said Ahmad</a>, <a href="/search/cs?searchtype=author&query=Anastasopoulos%2C+A">Antonios Anastasopoulos</a>, <a href="/search/cs?searchtype=author&query=Bojar%2C+O">Ondřej Bojar</a>, <a href="/search/cs?searchtype=author&query=Borg%2C+C">Claudia Borg</a>, <a href="/search/cs?searchtype=author&query=Carpuat%2C+M">Marine Carpuat</a>, <a href="/search/cs?searchtype=author&query=Cattoni%2C+R">Roldano Cattoni</a>, <a href="/search/cs?searchtype=author&query=Cettolo%2C+M">Mauro Cettolo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qianqian Dong</a>, <a href="/search/cs?searchtype=author&query=Federico%2C+M">Marcello Federico</a>, <a href="/search/cs?searchtype=author&query=Haddow%2C+B">Barry Haddow</a>, <a href="/search/cs?searchtype=author&query=Javorsk%C3%BD%2C+D">Dávid Javorský</a>, <a href="/search/cs?searchtype=author&query=Krubi%C5%84ski%2C+M">Mateusz Krubiński</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+T+K">Tsz Kin Lam</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xutai Ma</a>, <a href="/search/cs?searchtype=author&query=Mathur%2C+P">Prashant Mathur</a>, <a href="/search/cs?searchtype=author&query=Matusov%2C+E">Evgeny Matusov</a>, <a href="/search/cs?searchtype=author&query=Maurya%2C+C">Chandresh Maurya</a>, <a href="/search/cs?searchtype=author&query=McCrae%2C+J">John McCrae</a>, <a href="/search/cs?searchtype=author&query=Murray%2C+K">Kenton Murray</a>, <a href="/search/cs?searchtype=author&query=Nakamura%2C+S">Satoshi Nakamura</a>, <a href="/search/cs?searchtype=author&query=Negri%2C+M">Matteo Negri</a>, <a href="/search/cs?searchtype=author&query=Niehues%2C+J">Jan Niehues</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+X">Xing Niu</a>, <a href="/search/cs?searchtype=author&query=Ojha%2C+A+K">Atul Kr. Ojha</a> , et al. (20 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05088v1-abstract-short" style="display: inline;"> This paper reports on the shared tasks organized by the 21st IWSLT Conference. The shared tasks address 7 scientific challenges in spoken language translation: simultaneous and offline translation, automatic subtitling and dubbing, speech-to-speech translation, dialect and low-resource speech translation, and Indic languages.
The shared tasks attracted 18 teams whose submissions are documented in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05088v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05088v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05088v1-abstract-full" style="display: none;"> This paper reports on the shared tasks organized by the 21st IWSLT Conference. The shared tasks address 7 scientific challenges in spoken language translation: simultaneous and offline translation, automatic subtitling and dubbing, speech-to-speech translation, dialect and low-resource speech translation, and Indic languages. The shared tasks attracted 18 teams whose submissions are documented in 26 system papers. The growing interest in spoken language translation is also reflected in the steadily increasing number of shared task organizers and contributors to the overview paper, almost evenly distributed across industry and academia. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05088v1-abstract-full').style.display = 'none'; document.getElementById('2411.05088v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IWSLT 2024; 59 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16566">arXiv:2410.16566</a> <span> [<a href="https://arxiv.org/pdf/2410.16566">pdf</a>, <a href="https://arxiv.org/format/2410.16566">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Revenue vs. Welfare: A Comprehensive Analysis of Strategic Trade-offs in Online Food Delivery Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yukun Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qi Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16566v2-abstract-short" style="display: inline;"> This paper investigates the trade-off between short-term revenue generation and long-term social welfare optimization in online food delivery platforms. We first develop a static model that captures the equilibrium interactions among restaurants, consumers, and delivery workers, using Gross Merchandise Value (GMV) as a proxy for immediate performance.
Building on this, we extend our analysis to a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16566v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16566v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16566v2-abstract-full" style="display: none;"> This paper investigates the trade-off between short-term revenue generation and long-term social welfare optimization in online food delivery platforms. We first develop a static model that captures the equilibrium interactions among restaurants, consumers, and delivery workers, using Gross Merchandise Value (GMV) as a proxy for immediate performance. Building on this, we extend our analysis to a dynamic model that integrates evolving state variables, such as platform reputation and participant retention, to capture long-term behavior. By applying dynamic programming techniques, we derive optimal strategies that balance GMV maximization with social welfare enhancement. Extensive multi-agent simulations validate our theoretical predictions, demonstrating that while a GMV-focused approach yields strong initial gains, it ultimately undermines long-term stability. In contrast, a social welfare-oriented strategy produces more sustainable and robust outcomes. Our findings provide actionable insights for platform operators and policymakers seeking to harmonize rapid growth with long-term social welfare. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16566v2-abstract-full').style.display = 'none'; document.getElementById('2410.16566v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
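<p class="is-size-7">A toy value iteration in the spirit of the dynamic model described above, trading immediate GMV against a welfare term over a discretized reputation state; the reward and transition functions are invented purely for illustration.</p> <pre><code>
import numpy as np

states = np.linspace(0.0, 1.0, 11)     # discretized platform reputation
actions = [0.0, 0.5, 1.0]              # weight placed on welfare vs. GMV
gamma = 0.95                           # discount factor

def reward(s, a):
    gmv = (1 - a) * (0.5 + 0.5 * s)    # GMV rises with reputation
    return gmv + a * s                 # plus the welfare component

def step(s, a):
    # Welfare-oriented actions build reputation; pure GMV-chasing erodes it.
    return np.clip(s + 0.1 * (a - 0.3), 0.0, 1.0)

V = np.zeros(len(states))
for _ in range(200):                   # value iteration to (near) convergence
    for i, s in enumerate(states):
        V[i] = max(reward(s, a) + gamma * np.interp(step(s, a), states, V)
                   for a in actions)
print(V.round(2))
</code></pre>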
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15393">arXiv:2410.15393</a> <span> [<a href="https://arxiv.org/pdf/2410.15393">pdf</a>, <a href="https://arxiv.org/format/2410.15393">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CalibraEval: Calibrating Prediction Distribution to Mitigate Selection Bias in LLMs-as-Judges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haitao Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junjie Chen</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+Q">Qingyao Ai</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+Z">Zhumin Chu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yujia Zhou</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qian Dong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15393v1-abstract-short" style="display: inline;"> The use of large language models (LLMs) as automated evaluation tools to assess the quality of generated natural language, known as LLMs-as-Judges, has demonstrated promising capabilities and is rapidly gaining widespread attention. However, when applied to pairwise comparisons of candidate responses, LLM-based evaluators often exhibit selection bias. Specifically, their judgments may become incon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15393v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15393v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15393v1-abstract-full" style="display: none;"> The use of large language models (LLMs) as automated evaluation tools to assess the quality of generated natural language, known as LLMs-as-Judges, has demonstrated promising capabilities and is rapidly gaining widespread attention. However, when applied to pairwise comparisons of candidate responses, LLM-based evaluators often exhibit selection bias. Specifically, their judgments may become inconsistent when the option positions or ID tokens are swapped, compromising the effectiveness and fairness of the evaluation result. To address this challenge, we introduce CalibraEval, a novel label-free method for mitigating selection bias during inference. Specifically, CalibraEval reformulates debiasing as an optimization task aimed at adjusting observed prediction distributions to align with unbiased prediction distributions. To solve this optimization problem, we propose a non-parametric order-preserving algorithm (NOA). This algorithm leverages the partial order relationships between model prediction distributions, thereby eliminating the need for explicit labels and precise mathematical function modeling. Empirical evaluations of LLMs in multiple representative benchmarks demonstrate that CalibraEval effectively mitigates selection bias and improves performance compared to existing debiasing methods.
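<p class="is-size-7">For contrast with label-free calibration of this kind, a much simpler debiasing baseline (not the paper's NOA) averages the judge's option probabilities over both option orderings so that position preference cancels out:</p> <pre><code>
import numpy as np

def permutation_average(p_ab, p_ba):
    """p_ab: judge's (P_A, P_B) with order (A, B); p_ba: same with (B, A)."""
    p_a = (p_ab[0] + p_ba[1]) / 2      # A's probability under both orders
    p_b = (p_ab[1] + p_ba[0]) / 2
    total = p_a + p_b
    return p_a / total, p_b / total

# A judge that favors the first position still yields a symmetric verdict:
print(permutation_average((0.70, 0.30), (0.55, 0.45)))
</code></pre>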
This work marks a step toward building more robust and unbiased automated evaluation frameworks, paving the way for improved reliability in AI-driven assessments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15393v1-abstract-full').style.display = 'none'; document.getElementById('2410.15393v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13090">arXiv:2410.13090</a> <span> [<a href="https://arxiv.org/pdf/2410.13090">pdf</a>, <a href="https://arxiv.org/format/2410.13090">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Head Effect in Live Streaming Platforms: A Two-Sided Market and Welfare Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yukun Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qi Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13090v2-abstract-short" style="display: inline;"> We develop a comprehensive theoretical framework to analyze live streaming platforms as two-sided markets, focusing on the head effect where a small subset of elite streamers disproportionately attracts viewer attention. By constructing both static and dynamic models, we capture the interplay between network effects, content quality investments, and platform policies (such as commission structures… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13090v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13090v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13090v2-abstract-full" style="display: none;"> We develop a comprehensive theoretical framework to analyze live streaming platforms as two-sided markets, focusing on the head effect where a small subset of elite streamers disproportionately attracts viewer attention. By constructing both static and dynamic models, we capture the interplay between network effects, content quality investments, and platform policies (such as commission structures and traffic allocation algorithms) that drive traffic concentration. Our welfare analysis demonstrates that although short-term consumer utility may benefit from concentrated viewership, long-term content diversity and overall social welfare are adversely impacted. Extensive simulations further validate our models and show that targeted policy interventions can rebalance viewer distribution and mitigate winner-takes-all dynamics. These findings offer actionable insights for platform designers and regulators in the digital economy.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13090v2-abstract-full').style.display = 'none'; document.getElementById('2410.13090v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07985">arXiv:2410.07985</a> <span> [<a href="https://arxiv.org/pdf/2410.07985">pdf</a>, <a href="https://arxiv.org/format/2410.07985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Omni-MATH: A Universal Olympiad Level Mathematic Benchmark For Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+B">Bofei Gao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+F">Feifan Song</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhe Yang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zefan Cai</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenghao Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhengyang Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Zan%2C+D">Daoguang Zan</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+S">Shanghaoran Quan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+L">Lei Sha</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichang Zhang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xuancheng Ren</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+B">Baobao Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07985v3-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have led to significant breakthroughs in mathematical reasoning capabilities. However, existing benchmarks like GSM8K or MATH are now being solved with high accuracy (e.g., OpenAI o1 achieves 94.8\% on MATH dataset), indicating their inadequacy for truly challenging these models. 
To bridge this gap, we propose a comprehensive and challenging benc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07985v3-abstract-full').style.display = 'inline'; document.getElementById('2410.07985v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07985v3-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have led to significant breakthroughs in mathematical reasoning capabilities. However, existing benchmarks like GSM8K or MATH are now being solved with high accuracy (e.g., OpenAI o1 achieves 94.8\% on MATH dataset), indicating their inadequacy for truly challenging these models. To bridge this gap, we propose a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical reasoning at the Olympiad level. Unlike existing Olympiad-related benchmarks, our dataset focuses exclusively on mathematics and comprises a vast collection of 4428 competition-level problems with rigorous human annotation. These problems are meticulously categorized into over 33 sub-domains and span more than 10 distinct difficulty levels, enabling a holistic assessment of model performance in Olympiad-mathematical reasoning. Furthermore, we conducted an in-depth analysis based on this benchmark. Our experimental results show that even the most advanced models, OpenAI o1-mini and OpenAI o1-preview, struggle with highly challenging Olympiad-level problems, with 60.54\% and 52.55\% accuracy, highlighting significant challenges in Olympiad-level mathematical reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07985v3-abstract-full').style.display = 'none'; document.getElementById('2410.07985v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07064">arXiv:2410.07064</a> <span> [<a href="https://arxiv.org/pdf/2410.07064">pdf</a>, <a href="https://arxiv.org/format/2410.07064">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Data Selection via Optimal Control for Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuxian Gu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+L">Li Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongning Wang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yaru Hao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+F">Furu Wei</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Minlie Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07064v2-abstract-short" style="display: inline;"> This work investigates the selection of high-quality pre-training data from massive corpora to enhance LMs' capabilities for downstream usage. We formulate data selection as a generalized Optimal Control problem, which can be solved theoretically by Pontryagin's Maximum Principle (PMP), yielding a set of necessary conditions that characterize the relationship between optimal data selection and LM… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07064v2-abstract-full').style.display = 'inline'; document.getElementById('2410.07064v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07064v2-abstract-full" style="display: none;"> This work investigates the selection of high-quality pre-training data from massive corpora to enhance LMs' capabilities for downstream usage. We formulate data selection as a generalized Optimal Control problem, which can be solved theoretically by Pontryagin's Maximum Principle (PMP), yielding a set of necessary conditions that characterize the relationship between optimal data selection and LM training dynamics. Based on these theoretical results, we introduce PMP-based Data Selection (PDS), a framework that approximates optimal data selection by solving the PMP conditions. In our experiments, we adopt PDS to select data from CommmonCrawl and show that the PDS-selected corpus accelerates the learning of LMs and constantly boosts their performance on a wide range of downstream tasks across various model sizes. Moreover, the benefits of PDS extend to ~400B models trained on ~10T tokens, as evidenced by the extrapolation of the test loss curves according to the Scaling Laws. PDS also improves data utilization when the pre-training data is limited, by reducing the data demand by 1.8 times, which helps mitigate the quick exhaustion of available web-crawled corpora. Our code, model, and data can be found at https://github.com/microsoft/LMOps/tree/main/data_selection. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07064v2-abstract-full').style.display = 'none'; document.getElementById('2410.07064v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025 Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06961">arXiv:2410.06961</a> <span> [<a href="https://arxiv.org/pdf/2410.06961">pdf</a>, <a href="https://arxiv.org/format/2410.06961">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Self-Boosting Large Language Models with Synthetic Preference Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingxiu Dong</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+L">Li Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhifang Sui</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06961v1-abstract-short" style="display: inline;"> Through alignment with human preferences, Large Language Models (LLMs) have advanced significantly in generating honest, harmless, and helpful responses. However, collecting high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs. We introduce SynPO, a self-boosting paradigm that leverages synthetic preference data for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06961v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06961v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06961v1-abstract-full" style="display: none;"> Through alignment with human preferences, Large Language Models (LLMs) have advanced significantly in generating honest, harmless, and helpful responses. However, collecting high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs. We introduce SynPO, a self-boosting paradigm that leverages synthetic preference data for model alignment. SynPO employs an iterative mechanism wherein a self-prompt generator creates diverse prompts, and a response improver refines model responses progressively. This approach trains LLMs to autonomously learn the generative rewards for their own outputs and eliminates the need for large-scale annotation of prompts and human preferences. 
After four SynPO iterations, Llama3-8B and Mistral-7B show significant enhancements in instruction-following abilities, achieving over 22.1% win rate improvements on AlpacaEval 2.0 and ArenaHard. Simultaneously, SynPO improves the general performance of LLMs on various tasks, validated by a 3.2 to 5.0 average score increase on the well-recognized Open LLM leaderboard. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06961v1-abstract-full').style.display = 'none'; document.getElementById('2410.06961v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17665">arXiv:2409.17665</a> <span> [<a href="https://arxiv.org/pdf/2409.17665">pdf</a>, <a href="https://arxiv.org/format/2409.17665">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> A Novel Improved Beluga Whale Optimization Algorithm for Solving Localization Problem in Swarm Robotic Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Teng%2C+Z">Zuhao Teng</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qian Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17665v1-abstract-short" style="display: inline;"> In Swarm Robotic Systems (SRSs), only a few robots are equipped with Global Positioning System (GPS) devices, known as anchors. A challenge lies in inferring the positions of other unknown robots based on the positions of anchors. Existing solutions estimate their positions using distance measurements between unknown robots and anchors. Based on existing solutions, this study proposes a novel meta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17665v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17665v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17665v1-abstract-full" style="display: none;"> In Swarm Robotic Systems (SRSs), only a few robots are equipped with Global Positioning System (GPS) devices, known as anchors. A challenge lies in inferring the positions of other unknown robots based on the positions of anchors. Existing solutions estimate their positions using distance measurements between unknown robots and anchors. Based on existing solutions, this study proposes a novel meta-heuristic algorithm - Improved Beluga Whale Optimization Algorithm (IBWO) to address the localization problem of SRSs, focusing on enhancing the accuracy of localization results. Simulation results demonstrate the effectiveness of this study. Specifically, we test the localization accuracy of robots under different proportions of anchors, different communication radius of robots, and different total number of robots. 
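<p class="is-size-7">Whatever the specific IBWO update rules, the quantity being minimized is the standard range-based localization error; the sketch below shows that objective with a naive random search standing in for the meta-heuristic.</p> <pre><code>
import numpy as np

def range_error(pos, anchors, dists):
    """Sum of squared differences between measured and implied ranges."""
    return np.sum((np.linalg.norm(anchors - pos, axis=1) - dists) ** 2)

rng = np.random.default_rng(4)
anchors = rng.uniform(0, 100, size=(5, 2))       # GPS-equipped robots
true_pos = rng.uniform(0, 100, size=2)           # unknown robot
dists = np.linalg.norm(anchors - true_pos, axis=1)

best, best_err = None, np.inf
for cand in rng.uniform(0, 100, size=(20000, 2)):
    err = range_error(cand, anchors, dists)
    if best_err > err:
        best, best_err = cand, err
print(np.round(best, 1), np.round(true_pos, 1))  # should roughly agree
</code></pre>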
Compared to the traditional multilateration method and four other localization methods based on meta-heuristic algorithms, the localization accuracy of this method is consistently superior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17665v1-abstract-full').style.display = 'none'; document.getElementById('2409.17665v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13325">arXiv:2409.13325</a> <span> [<a href="https://arxiv.org/pdf/2409.13325">pdf</a>, <a href="https://arxiv.org/format/2409.13325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Semi-supervised Dual-modal Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiulei Dong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianan Li</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Shuang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13325v1-abstract-short" style="display: inline;"> With the development of 3D and 2D data acquisition techniques, it has become easy to obtain point clouds and images of scenes simultaneously, which further facilitates dual-modal semantic segmentation. Most existing methods for simultaneously segmenting point clouds and images rely heavily on the quantity and quality of the labeled training data. However, massive point-wise and pixel-wise labeling… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13325v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13325v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13325v1-abstract-full" style="display: none;"> With the development of 3D and 2D data acquisition techniques, it has become easy to obtain point clouds and images of scenes simultaneously, which further facilitates dual-modal semantic segmentation. Most existing methods for simultaneously segmenting point clouds and images rely heavily on the quantity and quality of the labeled training data. However, massive point-wise and pixel-wise labeling procedures are time-consuming and labor-intensive. To address this issue, we propose a parallel dual-stream network to handle the semi-supervised dual-modal semantic segmentation task, called PD-Net, by jointly utilizing a small number of labeled point clouds, a large number of unlabeled point clouds, and unlabeled images. The proposed PD-Net consists of two parallel streams (called original stream and pseudo-label prediction stream). The pseudo-label prediction stream predicts the pseudo labels of unlabeled point clouds and their corresponding images. Then, the unlabeled data is sent to the original stream for self-training. 
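<p class="is-size-7">A generic sketch of the confidence-based pseudo-label filtering step common to such self-training pipelines (see the architectural details that follow); PD-Net's pseudo-label optimization module is more involved, and the threshold here is an assumption.</p> <pre><code>
import numpy as np

def filter_pseudo_labels(probs, threshold=0.9):
    """probs: (num_points, num_classes) softmax outputs on unlabeled data."""
    confidence = probs.max(axis=1)
    labels = probs.argmax(axis=1)
    keep = confidence >= threshold
    return np.flatnonzero(keep), labels[keep]

rng = np.random.default_rng(5)
logits = rng.normal(size=(1000, 13)) * 3
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
idx, pseudo = filter_pseudo_labels(probs)
print(f"kept {len(idx)} of 1000 points for self-training")
</code></pre>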
arXiv:2409.12533 [pdf] (eess.IV, cs.CV) https://arxiv.org/abs/2409.12533
MambaClinix: Hierarchical Gated Convolution and Mamba-Based U-Net for Enhanced 3D Medical Image Segmentation
Authors: Chenyuan Bian, Nan Xia, Xia Yang, Feifei Wang, Fengjiao Wang, Bin Wei, Qian Dong
Abstract: Deep learning, particularly convolutional neural networks (CNNs) and Transformers, has significantly advanced 3D medical image segmentation. While CNNs are highly effective at capturing local features, their limited receptive fields may hinder performance in complex clinical scenarios. In contrast, Transformers excel at modeling long-range dependencies but are computationally intensive, making them expensive to train and deploy. Recently, the Mamba architecture, based on the State Space Model (SSM), has been proposed to model long-range dependencies efficiently while maintaining linear computational complexity. However, its application to medical image segmentation reveals shortcomings, particularly in capturing the critical local features essential for accurate delineation of clinical regions. In this study, we propose MambaClinix, a novel U-shaped architecture for medical image segmentation that integrates a hierarchical gated convolutional network (HGCN) with Mamba in an adaptive stage-wise framework. This design significantly enhances computational efficiency and high-order spatial interactions, enabling the model to capture both proximal and distal relationships in medical images effectively. Specifically, the HGCN mimics the attention mechanism of Transformers with a purely convolutional structure, facilitating high-order spatial interactions in feature maps while avoiding the computational complexity typical of Transformer-based methods. Additionally, we introduce a region-specific Tversky loss, which emphasizes specific pixel regions to improve auto-segmentation performance, thereby optimizing the model's decision-making process. Experimental results on five benchmark datasets demonstrate that MambaClinix achieves high segmentation accuracy while maintaining low model complexity.
Submitted 19 September, 2024; originally announced September 2024.
Comments: 18 pages, 5 figures
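The Tversky loss mentioned above generalizes the Dice loss by weighting false positives and false negatives asymmetrically. A minimal sketch; the optional weight map is one plausible way to make it region-specific, since the abstract does not spell out the paper's exact weighting scheme:

```python
import numpy as np

def tversky_loss(pred, target, alpha=0.3, beta=0.7, eps=1e-6, weight=None):
    """Soft Tversky loss for binary segmentation.

    pred, target: arrays in [0, 1] of the same shape.
    alpha/beta trade off false positives vs. false negatives; an optional
    per-voxel weight map is one way to emphasize specific regions (a
    hypothetical stand-in for the paper's region-specific variant).
    """
    if weight is None:
        weight = np.ones_like(pred)
    tp = np.sum(weight * pred * target)          # soft true positives
    fp = np.sum(weight * pred * (1 - target))    # soft false positives
    fn = np.sum(weight * (1 - pred) * target)    # soft false negatives
    return 1.0 - tp / (tp + alpha * fp + beta * fn + eps)

pred = np.array([0.9, 0.8, 0.2, 0.1])
target = np.array([1.0, 1.0, 0.0, 0.0])
print(tversky_loss(pred, target))
```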
arXiv:2409.02795 [pdf, other] (cs.CL) https://arxiv.org/abs/2409.02795
Towards a Unified View of Preference Learning for Large Language Models: A Survey
Authors: Bofei Gao, Feifan Song, Yibo Miao, Zefan Cai, Zhe Yang, Liang Chen, Helan Hu, Runxin Xu, Qingxiu Dong, Ce Zheng, Shanghaoran Quan, Wen Xiao, Ge Zhang, Daoguang Zan, Keming Lu, Bowen Yu, Dayiheng Liu, Zeyu Cui, Jian Yang, Lei Sha, Houfeng Wang, Zhifang Sui, Peiyi Wang, Tianyu Liu, Baobao Chang
Abstract: Large Language Models (LLMs) exhibit remarkably powerful capabilities, and a crucial factor in their success is aligning their output with human preferences. This alignment process often requires only a small amount of data to efficiently enhance an LLM's performance. While effective, research in this area spans multiple domains, and the methods involved are relatively complex to understand; the relationships between different methods have been under-explored, limiting the development of preference alignment. In light of this, we break down the existing popular alignment strategies into different components and provide a unified framework for studying current alignment strategies, thereby establishing connections among them. In this survey, we decompose all strategies in preference learning into four components: model, data, feedback, and algorithm. This unified view offers an in-depth understanding of existing alignment algorithms and opens up possibilities to synergize the strengths of different strategies. Furthermore, we present detailed working examples of prevalent existing algorithms to facilitate a comprehensive understanding for readers. Finally, based on our unified perspective, we explore the challenges and future research directions for aligning large language models with human preferences.
Submitted 31 October, 2024; v1 submitted 4 September, 2024; originally announced September 2024.
Comments: 23 pages, 6 figures
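As one concrete instance of the four-component view (model, data, feedback, algorithm), here is a sketch of the Direct Preference Optimization (DPO) objective for a single preference pair, one of the prevalent algorithms such surveys cover; the log-probability values are invented:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):
    """Direct Preference Optimization loss for one preference pair.

    logp_*     : policy log-probabilities of the chosen (w) and rejected (l)
                 responses; ref_logp_* are the frozen reference model's.
    The loss rewards the policy for widening the chosen-vs-rejected margin
    relative to the reference model.
    """
    margin = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
    return -np.log(sigmoid(margin))

# Policy already slightly prefers the chosen response: small loss.
print(dpo_loss(-10.0, -12.0, -10.5, -11.0))
```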
arXiv:2408.16202 [pdf, other] (cs.LG, cs.AI) https://arxiv.org/abs/2408.16202
Short-Term Electricity-Load Forecasting by Deep Learning: A Comprehensive Survey
Authors: Qi Dong, Rubing Huang, Chenhui Cui, Dave Towey, Ling Zhou, Jinyu Tian, Jianzhou Wang
Abstract: Short-Term Electricity-Load Forecasting (STELF) refers to predicting the immediate demand (in the next few hours to several days) on the power system. Various external factors, such as weather changes and the emergence of new electricity-consumption scenarios, can affect demand, causing load data to fluctuate and become non-linear, which increases the complexity and difficulty of STELF. In the past decade, deep learning has been applied to STELF, modeling and predicting electricity demand with high accuracy and contributing significantly to the field's development. This paper provides a comprehensive survey of deep-learning-based STELF over the past ten years, examining the entire forecasting process, including data pre-processing, feature extraction, deep-learning modeling and optimization, and results evaluation. It also identifies research challenges and potential directions to be investigated in future work.
Submitted 28 August, 2024; originally announced August 2024.
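The supervised framing behind most deep STELF pipelines is a sliding window: predict the next few hours of load from the preceding ones. A minimal sketch with synthetic load data and hypothetical lookback/horizon values:

```python
import numpy as np

def make_windows(series, lookback=24, horizon=6):
    """Frame STELF as supervised learning: predict the next `horizon`
    hours of load from the previous `lookback` hours. Deep forecasting
    models consume (X, y) pairs of exactly this shape."""
    X, y = [], []
    for t in range(len(series) - lookback - horizon + 1):
        X.append(series[t:t + lookback])
        y.append(series[t + lookback:t + lookback + horizon])
    return np.array(X), np.array(y)

# Invented stand-in for an hourly load curve: seasonal signal plus noise.
load = np.sin(np.linspace(0, 20, 500)) + np.random.default_rng(0).normal(0, 0.05, 500)
X, y = make_windows(load)
print(X.shape, y.shape)   # (471, 24) (471, 6)
```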
arXiv:2408.05808 [pdf, other] (cs.RO, cs.MA) https://arxiv.org/abs/2408.05808
DOI: 10.1109/IROS58592.2024.10801613 (https://doi.org/10.1109/IROS58592.2024.10801613)
Fast and Communication-Efficient Multi-UAV Exploration Via Voronoi Partition on Dynamic Topological Graph
Authors: Qianli Dong, Haobo Xi, Shiyong Zhang, Qingchen Bi, Tianyi Li, Ziyu Wang, Xuebo Zhang
Abstract: Efficient data transmission and reasonable task allocation are important for improving multi-robot exploration efficiency. However, most communication data types contain redundant information and therefore require massive communication volume; moreover, exploration-oriented task allocation is far from trivial and becomes even more challenging for resource-limited unmanned aerial vehicles (UAVs). In this paper, we propose a fast and communication-efficient multi-UAV exploration method for exploring large environments. We first design a multi-robot dynamic topological graph (MR-DTG) consisting of nodes representing the explored and exploring regions and edges connecting the nodes. Supported by MR-DTG, our method achieves efficient communication by transferring only the information required by exploration planning. To further improve exploration efficiency, a hierarchical multi-UAV exploration method is devised on top of MR-DTG: the graph Voronoi partition allocates MR-DTG's nodes to the closest UAVs, considering the actual motion cost, thus achieving reasonable task allocation. To our knowledge, this is the first work to address multi-UAV exploration using the graph Voronoi partition. Compared with a state-of-the-art method in simulation, the proposed method reduces exploration time and communication volume by up to 38.3% and 95.5%, respectively. Finally, the method's effectiveness is validated in a real-world experiment with 6 UAVs. We will release the source code to benefit the community.
Submitted 11 August, 2024; originally announced August 2024.
Comments: 8 pages, 8 figures, accepted by IEEE IROS2024, code see https://github.com/NKU-MobFly-Robotics/GVP-MREP
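A graph Voronoi partition assigns every node to the source that reaches it at least motion cost, computable with one multi-source Dijkstra pass. A minimal sketch on a toy graph (the adjacency and costs are invented; the paper's MR-DTG construction is not reproduced here):

```python
import heapq

def graph_voronoi(adj, sources):
    """Partition a weighted graph among several sources (here, UAV
    positions on the topological graph) via multi-source Dijkstra:
    every node is assigned to the source with the smallest path cost.
    adj: {node: [(neighbor, cost), ...]}."""
    dist = {n: float("inf") for n in adj}
    owner = {n: None for n in adj}
    heap = []
    for s in sources:
        dist[s] = 0.0
        owner[s] = s
        heapq.heappush(heap, (0.0, s, s))
    while heap:
        d, u, src = heapq.heappop(heap)
        if d > dist[u]:
            continue                      # stale heap entry
        for v, w in adj[u]:
            if d + w < dist[v]:
                dist[v] = d + w
                owner[v] = src
                heapq.heappush(heap, (dist[v], v, src))
    return owner

adj = {0: [(1, 1.0), (2, 4.0)], 1: [(0, 1.0), (2, 1.0), (3, 5.0)],
       2: [(0, 4.0), (1, 1.0), (3, 1.0)], 3: [(1, 5.0), (2, 1.0)]}
print(graph_voronoi(adj, sources=[0, 3]))  # {0: 0, 1: 0, 2: 3, 3: 3}
```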
arXiv:2407.21783 [pdf, other] (cs.AI, cs.CL, cs.CV) https://arxiv.org/abs/2407.21783
The Llama 3 Herd of Models
Authors: Aaron Grattafiori, Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Alex Vaughan, Amy Yang, Angela Fan, Anirudh Goyal, Anthony Hartshorn, Aobo Yang, Archi Mitra, Archie Sravankumar, Artem Korenev, Arthur Hinsvark, Arun Rao, Aston Zhang, Aurelien Rodriguez, Austen Gregerson, Ava Spataru, Baptiste Roziere, et al. (536 additional authors not shown)
Abstract: Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3: a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. The paper presents an extensive empirical evaluation of Llama 3, finding that it delivers quality comparable to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B-parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments integrating image, video, and speech capabilities into Llama 3 via a compositional approach; we observe that this approach performs competitively with the state of the art on image, video, and speech recognition tasks. The resulting multimodal models are not yet being broadly released, as they are still under development.
Submitted 23 November, 2024; v1 submitted 31 July, 2024; originally announced July 2024.
arXiv:2407.10427 [pdf, other] (eess.IV, cs.CV) https://arxiv.org/abs/2407.10427
Transformer for Multitemporal Hyperspectral Image Unmixing
Authors: Hang Li, Qiankun Dong, Xueshuo Xie, Xia Xu, Tao Li, Zhenwei Shi
Abstract: Multitemporal hyperspectral image unmixing (MTHU) is important for monitoring and analyzing dynamic changes of the surface. However, compared to single-temporal unmixing, the multitemporal approach demands comprehensive consideration of information across different phases, rendering it a greater challenge. To address this challenge, we propose the Multitemporal Hyperspectral Image Unmixing Transformer (MUFormer), an end-to-end unsupervised deep learning model. To perform multitemporal hyperspectral image unmixing effectively, we introduce two key modules: the Global Awareness Module (GAM) and the Change Enhancement Module (CEM). The GAM computes self-attention across all phases, facilitating global weight allocation, while the CEM dynamically learns local temporal changes by comparing endmember changes between adjacent phases. The synergy between these modules captures semantic information about endmember and abundance changes, thereby enhancing the effectiveness of multitemporal hyperspectral image unmixing. Experiments on one real dataset and two synthetic datasets demonstrate that our model significantly enhances the effect of multitemporal hyperspectral image unmixing.
Submitted 15 July, 2024; originally announced July 2024.
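A minimal sketch of self-attention computed across temporal phases, the mechanism a global-awareness module of this kind relies on; learned projection matrices are omitted and the features are random placeholders:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def phase_self_attention(feats):
    """Toy attention across all temporal phases: feats is (T, d), one
    feature vector per acquisition date. Each phase attends to every
    other phase, which is how attention can allocate weights globally
    over the sequence (query/key/value projections omitted)."""
    scores = feats @ feats.T / np.sqrt(feats.shape[1])  # (T, T)
    return softmax(scores, axis=-1) @ feats             # (T, d)

feats = np.random.default_rng(0).normal(size=(4, 8))    # 4 phases
print(phase_self_attention(feats).shape)                # (4, 8)
```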
arXiv:2407.04675 [pdf, other] (eess.AS, cs.SD) https://arxiv.org/abs/2407.04675
Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition
Authors: Ye Bai, Jingping Chen, Jitong Chen, Wei Chen, Zhuo Chen, Chuang Ding, Linhao Dong, Qianqian Dong, Yujiao Du, Kepan Gao, Lu Gao, Yi Guo, Minglun Han, Ting Han, Wenchao Hu, Xinying Hu, Yuxiang Hu, Deyu Hua, Lu Huang, Mingkun Huang, Youjia Huang, Jishuo Jin, Fanliu Kong, Zongwei Lan, Tianyu Li, et al. (30 additional authors not shown)
Abstract: Modern automatic speech recognition (ASR) models are required to transcribe diverse speech signals accurately (from different domains, languages, accents, etc.) given specific contextual information in various application scenarios. Classic end-to-end models fused with extra language models perform well, but mainly in data-matching scenarios, and are gradually approaching a bottleneck. In this work, we introduce Seed-ASR, a large language model (LLM) based speech recognition model. Seed-ASR is built on the framework of an audio-conditioned LLM (AcLLM), leveraging the capabilities of LLMs by feeding continuous speech representations together with contextual information into the LLM. Through stage-wise large-scale training and the elicitation of context-aware capabilities in the LLM, Seed-ASR demonstrates significant improvement over end-to-end models on comprehensive evaluation sets covering multiple domains, accents/dialects, and languages. Additionally, Seed-ASR can be deployed to support specific needs in various scenarios without requiring extra language models. Compared to recently released large ASR models, Seed-ASR achieves 10%-40% reductions in word (or character, for Chinese) error rates on Chinese and English public test sets, further demonstrating its powerful performance.
Submitted 10 July, 2024; v1 submitted 5 July, 2024; originally announced July 2024.
arXiv:2407.01213 [pdf, other] (cs.SI) https://arxiv.org/abs/2407.01213
EMIF: Evidence-aware Multi-source Information Fusion Network for Explainable Fake News Detection
Authors: Qingxing Dong, Mengyi Zhang, Shiyuan Wu, Xiaozhen Wu
Abstract: Extensive research on automatic fake news detection has been conducted due to the significant detrimental effects of fake news proliferation. Most existing approaches rely on a single source of evidence, such as comments or relevant news, to derive explanatory evidence for decision-making, and demonstrate exceptional performance. However, a single evidence source suffers from two critical drawbacks: (i) noise abundance and (ii) resilience deficiency. Inspired by the natural process of fake news identification, we propose an Evidence-aware Multi-source Information Fusion (EMIF) network that jointly leverages user comments and relevant news to make precise decisions and excavate reliable evidence. To accomplish this, we first construct a co-attention network to capture general semantic conflicts between comments and the original news. Meanwhile, a divergence selection module identifies the top-K relevant news articles whose content deviates most from the original news, ensuring the acquisition of multiple pieces of evidence with higher objectivity. Finally, we utilize an inconsistency loss within the evidence fusion layer to strengthen the consistency of the two types of evidence, both negating the authenticity of the same news. Extensive experiments and ablation studies on the real-world dataset FibVID show the effectiveness of the proposed model. Notably, EMIF shows remarkable robustness even in scenarios where a particular source of information is inadequate.
Submitted 1 July, 2024; originally announced July 2024.
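A minimal sketch of the divergence-selection idea: rank candidate news articles by how far their embeddings deviate from the original news and keep the top K. The cosine-distance criterion and the random embeddings are assumptions for illustration:

```python
import numpy as np

def topk_divergent(orig_vec, news_vecs, k=3):
    """Pick the k relevant news articles whose content deviates most
    from the original news, using cosine similarity on embedding
    vectors (the embedding model itself is not specified here)."""
    def cos(a, b):
        return a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)
    sims = np.array([cos(orig_vec, v) for v in news_vecs])
    return np.argsort(sims)[:k]          # least similar = most divergent

rng = np.random.default_rng(1)
orig = rng.normal(size=16)               # embedding of the original news
news = rng.normal(size=(10, 16))         # embeddings of candidate articles
print(topk_divergent(orig, news))
```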
arXiv:2406.16985 [pdf, other] (cs.LG, cs.AI, cs.CL) https://arxiv.org/abs/2406.16985
Unveiling LLM Mechanisms Through Neural ODEs and Control Theory
Authors: Yukun Zhang, Qi Dong
Abstract: This paper proposes a framework combining Neural Ordinary Differential Equations (Neural ODEs) and robust control theory to enhance the interpretability and control of large language models (LLMs). By utilizing Neural ODEs to model the dynamic evolution of input-output relationships and introducing control mechanisms to optimize output quality, we demonstrate the effectiveness of this approach across multiple question-answering datasets. Experimental results show that the integration of Neural ODEs and control theory significantly improves output consistency and model interpretability, advancing the development of explainable AI technologies.
Submitted 23 February, 2025; v1 submitted 23 June, 2024; originally announced June 2024.
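A Neural ODE treats the hidden state as evolving continuously under a learned vector field. A minimal Euler-integration sketch; tanh(W h) is a stand-in dynamics function, not the paper's model, and the control input is noted only in the comment:

```python
import numpy as np

def neural_ode_euler(h0, W, steps=100, t1=1.0):
    """Euler integration of dh/dt = tanh(W h): the Neural ODE view of a
    hidden state evolving continuously between input and output. W is a
    stand-in for a learned vector field; a control term would enter as
    an extra additive input u(t) on the right-hand side."""
    h, dt = h0.copy(), t1 / steps
    for _ in range(steps):
        h = h + dt * np.tanh(W @ h)
    return h

rng = np.random.default_rng(0)
W = rng.normal(scale=0.5, size=(4, 4))
print(neural_ode_euler(rng.normal(size=4), W))
```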
arXiv:2406.15846 [pdf, other] (cs.CL, eess.AS) https://arxiv.org/abs/2406.15846
Revisiting Interpolation Augmentation for Speech-to-Text Generation
Authors: Chen Xu, Jie Wang, Xiaoqian Liu, Qianqian Dong, Chunliang Zhang, Tong Xiao, Jingbo Zhu, Dapeng Man, Wu Yang
Abstract: Speech-to-text (S2T) generation systems frequently face challenges in low-resource scenarios, primarily due to the lack of extensive labeled datasets. One emerging solution is to construct virtual training samples by interpolating inputs and labels, which has notably enhanced system generalization in other domains. Despite its potential, this technique's application to S2T tasks has remained under-explored. In this paper, we delve into the utility of interpolation augmentation, guided by several pivotal questions. Our findings reveal that an appropriate interpolation-augmentation strategy significantly enhances performance across diverse tasks, architectures, and data scales, offering a promising avenue for more robust S2T systems in resource-constrained settings.
Submitted 22 June, 2024; originally announced June 2024.
Comments: ACL 2024 Findings
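Interpolation augmentation of this family (mixup) builds a virtual sample as a convex combination of two real ones. A minimal sketch; treating inputs as flat vectors and labels as one-hot distributions is a simplification of the S2T setting, where both are sequences:

```python
import numpy as np

def interpolate_pair(x1, y1, x2, y2, alpha=0.2, rng=np.random.default_rng()):
    """Construct one virtual training sample by interpolating two real
    ones: draw lam ~ Beta(alpha, alpha), then take the same convex
    combination of both inputs and both label distributions."""
    lam = rng.beta(alpha, alpha)
    return lam * x1 + (1 - lam) * x2, lam * y1 + (1 - lam) * y2

x1, x2 = np.random.rand(80), np.random.rand(80)   # e.g. flattened fbank frames
y1, y2 = np.eye(5)[1], np.eye(5)[3]               # one-hot label distributions
x_mix, y_mix = interpolate_pair(x1, y1, x2, y2)
print(y_mix)
```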
arXiv:2405.17158 [pdf, other] (cs.CV) https://arxiv.org/abs/2405.17158
PatchScaler: An Efficient Patch-Independent Diffusion Model for Image Super-Resolution
Authors: Yong Liu, Hang Dong, Jinshan Pan, Qingji Dong, Kai Chen, Rongxiang Zhang, Lean Fu, Fei Wang
Abstract: While diffusion models significantly improve the perceptual quality of super-resolved images, they usually require a large number of sampling steps, resulting in high computational costs and long inference times. Recent efforts have explored reasonable acceleration schemes that reduce the number of sampling steps. However, these approaches treat all regions of an image equally, overlooking the fact that regions with different levels of reconstruction difficulty require different numbers of sampling steps. To address this limitation, we propose PatchScaler, an efficient patch-independent diffusion pipeline for single-image super-resolution. Specifically, PatchScaler introduces a Patch-adaptive Group Sampling (PGS) strategy that groups feature patches by quantifying their reconstruction difficulty and establishes shortcut paths with different sampling configurations for each group. To further optimize the patch-level reconstruction process of PGS, we propose a texture prompt that provides rich texture conditional information to the diffusion model, adaptively retrieving texture priors for the target patch from a common reference texture memory. Extensive experiments show that PatchScaler achieves superior performance in both quantitative and qualitative evaluations while significantly speeding up inference. Our code will be available at https://github.com/yongliuy/PatchScaler.
Submitted 21 November, 2024; v1 submitted 27 May, 2024; originally announced May 2024.
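A minimal sketch of grouping patches by estimated reconstruction difficulty and assigning each group its own sampling budget; local variance, the bin edges, and the step counts are invented proxies, since the abstract does not specify how PGS quantifies difficulty:

```python
import numpy as np

def group_patches_by_difficulty(patches, bins=(0.02, 0.06)):
    """Assign each patch a sampling budget from an estimated
    reconstruction difficulty, here approximated by local variance;
    easy patches get a short denoising path, hard ones a longer one."""
    budgets = []
    for p in patches:
        d = p.var()
        if d < bins[0]:
            budgets.append(5)      # easy: few denoising steps
        elif d < bins[1]:
            budgets.append(15)
        else:
            budgets.append(40)     # hard: full-length sampling
    return budgets

rng = np.random.default_rng(0)
patches = [rng.normal(0, s, size=(16, 16)) for s in (0.05, 0.2, 0.5)]
print(group_patches_by_difficulty(patches))   # [5, 15, 40]
```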
arXiv:2405.13745 [pdf, other] (cs.CV) https://arxiv.org/abs/2405.13745
NeurCross: A Self-Supervised Neural Approach for Representing Cross Fields in Quad Mesh Generation
Authors: Qiujie Dong, Huibiao Wen, Rui Xu, Xiaokang Yu, Jiaran Zhou, Shuangmin Chen, Shiqing Xin, Changhe Tu, Wenping Wang
Abstract: Quadrilateral mesh generation plays a crucial role in numerical simulations within Computer-Aided Design and Engineering (CAD/E), and the quality of the cross field is essential for generating a quadrilateral mesh. In this paper, we propose a self-supervised neural representation of the cross field, named NeurCross, comprising two modules: one to fit the signed distance function (SDF) and another to predict the cross field. Unlike most existing approaches, which operate directly on the given polygonal surface, NeurCross takes the SDF as a bridge, allowing SDF overfitting and cross-field prediction to proceed simultaneously. By utilizing a neural SDF, we achieve a smooth representation of the base surface, minimizing the impact of piecewise planar discretization and minor surface variations. Moreover, the principal curvatures and directions are fully encoded by the Hessian of the SDF, enabling regularization of the overall cross field through minor adjustments to the SDF. Compared to state-of-the-art methods, NeurCross significantly improves the placement of singular points and the approximation accuracy between the input triangular surface and the output quad mesh, as demonstrated in the teaser figure.
Submitted 22 May, 2024; originally announced May 2024.
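For a signed distance function with unit gradient, the nonzero eigenvalues of its Hessian at a surface point are the principal curvatures, the property the entry above exploits. A minimal finite-difference check on a unit-sphere SDF (the numerical scheme is an illustration, not the paper's method):

```python
import numpy as np

def hessian_fd(f, p, h=1e-4):
    """Central-difference Hessian of a scalar field f at point p. For a
    unit-gradient SDF, its nonzero eigenvalues at a surface point are
    the principal curvatures of the surface."""
    n = len(p)
    H = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            e_i, e_j = np.eye(n)[i] * h, np.eye(n)[j] * h
            H[i, j] = (f(p + e_i + e_j) - f(p + e_i - e_j)
                       - f(p - e_i + e_j) + f(p - e_i - e_j)) / (4 * h * h)
    return H

sphere_sdf = lambda p: np.linalg.norm(p) - 1.0
H = hessian_fd(sphere_sdf, np.array([1.0, 0.0, 0.0]))
print(np.round(np.linalg.eigvalsh(H), 3))   # ~[0, 1, 1]: unit-sphere curvatures
```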
arXiv:2404.13420 [pdf, other] (cs.CV) https://arxiv.org/abs/2404.13420
NeurCADRecon: Neural Representation for Reconstructing CAD Surfaces by Enforcing Zero Gaussian Curvature
Authors: Qiujie Dong, Rui Xu, Pengfei Wang, Shuangmin Chen, Shiqing Xin, Xiaohong Jia, Wenping Wang, Changhe Tu
Abstract: Despite recent advances in reconstructing organic models with the neural signed distance function (SDF), high-fidelity reconstruction of a CAD model directly from low-quality unoriented point clouds remains a significant challenge. In this paper, we address this challenge based on the prior observation that the surface of a CAD model is generally composed of piecewise surface patches, each approximately developable even around feature lines. Our approach, named NeurCADRecon, is self-supervised, and its loss includes a developability term that encourages the Gaussian curvature toward 0 while ensuring fidelity to the input points. Noticing that the Gaussian curvature is non-zero at tip points, we introduce a double-trough curve to tolerate the existence of these tip points. Furthermore, we develop a dynamic sampling strategy to deal with situations where the given points are incomplete or too sparse. Since the resulting neural SDFs can clearly manifest sharp feature points/lines, one can easily extract the feature-aligned triangle mesh from the SDF and then decompose it into smooth surface patches, greatly reducing the difficulty of recovering the parametric CAD design. A comprehensive comparison with existing state-of-the-art methods shows the significant advantage of our approach in reconstructing faithful CAD shapes.
Submitted 20 April, 2024; originally announced April 2024.
Comments: ACM Transactions on Graphics (SIGGRAPH 2024)
arXiv:2404.04808 [pdf, other] (cs.CV) https://arxiv.org/abs/2404.04808
MemFlow: Optical Flow Estimation and Prediction with Memory
Authors: Qiaole Dong, Yanwei Fu
Abstract: Optical flow is a classical task that is important to the vision community. Classical optical flow estimation uses two frames as input, whilst some recent methods consider multiple frames to model long-range information explicitly. The former limits the ability to fully leverage temporal coherence along the video sequence, while the latter incurs heavy computational overhead, typically not feasible for real-time flow estimation. Some multi-frame-based approaches even require unseen future frames for the current estimate, compromising real-time applicability in safety-critical scenarios. To this end, we present MemFlow, a real-time method for optical flow estimation and prediction with memory. Our method enables memory read-out and update modules for aggregating historical motion information in real time, and integrates resolution-adaptive re-scaling to accommodate diverse video resolutions. Besides, our approach seamlessly extends to future prediction of optical flow based on past observations. Leveraging effective historical motion aggregation, our method outperforms VideoFlow with fewer parameters and faster inference speed on the Sintel and KITTI-15 datasets in terms of generalization performance. At the time of submission, MemFlow also leads in performance on the 1080p Spring dataset. Codes and models will be available at: https://dqiaole.github.io/MemFlow/.
Submitted 7 April, 2024; originally announced April 2024.
Comments: CVPR 2024
Our findings reveal that employing an appropriate strategy in interpolation augmentation significantly enhances performance across diverse tasks, architectures, and data scales, offering a promising avenue for more robust S2T systems in resource-constrained settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15846v1-abstract-full').style.display = 'none'; document.getElementById('2406.15846v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17158">arXiv:2405.17158</a> <span> [<a href="https://arxiv.org/pdf/2405.17158">pdf</a>, <a href="https://arxiv.org/format/2405.17158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PatchScaler: An Efficient Patch-Independent Diffusion Model for Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Hang Dong</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jinshan Pan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qingji Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rongxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+L">Lean Fu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17158v4-abstract-short" style="display: inline;"> While diffusion models significantly improve the perceptual quality of super-resolved images, they usually require a large number of sampling steps, resulting in high computational costs and long inference times. Recent efforts have explored reasonable acceleration schemes by reducing the number of sampling steps. However, these approaches treat all regions of the image equally, overlooking the fa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17158v4-abstract-full').style.display = 'inline'; document.getElementById('2405.17158v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17158v4-abstract-full" style="display: none;"> While diffusion models significantly improve the perceptual quality of super-resolved images, they usually require a large number of sampling steps, resulting in high computational costs and long inference times. Recent efforts have explored reasonable acceleration schemes by reducing the number of sampling steps. 
However, these approaches treat all regions of the image equally, overlooking the fact that regions with varying levels of reconstruction difficulty require different sampling steps. To address this limitation, we propose PatchScaler, an efficient patch-independent diffusion pipeline for single image super-resolution. Specifically, PatchScaler introduces a Patch-adaptive Group Sampling (PGS) strategy that groups feature patches by quantifying their reconstruction difficulty and establishes shortcut paths with different sampling configurations for each group. To further optimize the patch-level reconstruction process of PGS, we propose a texture prompt that provides rich texture conditional information to the diffusion model. The texture prompt adaptively retrieves texture priors for the target patch from a common reference texture memory. Extensive experiments show that our PatchScaler achieves superior performance in both quantitative and qualitative evaluations, while significantly speeding up inference. Our code will be available at \url{https://github.com/yongliuy/PatchScaler}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17158v4-abstract-full').style.display = 'none'; document.getElementById('2405.17158v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13745">arXiv:2405.13745</a> <span> [<a href="https://arxiv.org/pdf/2405.13745">pdf</a>, <a href="https://arxiv.org/format/2405.13745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeurCross: A Self-Supervised Neural Approach for Representing Cross Fields in Quad Mesh Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiujie Dong</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+H">Huibiao Wen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Rui Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaokang Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiaran Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuangmin Chen</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+S">Shiqing Xin</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+C">Changhe Tu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13745v1-abstract-short" style="display: inline;"> Quadrilateral mesh generation plays a crucial role in numerical simulations within Computer-Aided Design and Engineering (CAD/E). The quality of the cross field is essential for generating a quadrilateral mesh. 
In this paper, we propose a self-supervised neural representation of the cross field, named NeurCross, comprising two modules: one to fit the signed distance function (SDF) and another to p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13745v1-abstract-full').style.display = 'inline'; document.getElementById('2405.13745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13745v1-abstract-full" style="display: none;"> Quadrilateral mesh generation plays a crucial role in numerical simulations within Computer-Aided Design and Engineering (CAD/E). The quality of the cross field is essential for generating a quadrilateral mesh. In this paper, we propose a self-supervised neural representation of the cross field, named NeurCross, comprising two modules: one to fit the signed distance function (SDF) and another to predict the cross field. Unlike most existing approaches that operate directly on the given polygonal surface, NeurCross takes the SDF as a bridge to allow for SDF overfitting and the prediction of the cross field to proceed simultaneously. By utilizing a neural SDF, we achieve a smooth representation of the base surface, minimizing the impact of piecewise planar discretization and minor surface variations. Moreover, the principal curvatures and directions are fully encoded by the Hessian of the SDF, enabling the regularization of the overall cross field through minor adjustments to the SDF. Compared to state-of-the-art methods, NeurCross significantly improves the placement of singular points and the approximation accuracy between the input triangular surface and the output quad mesh, as demonstrated in the teaser figure. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13745v1-abstract-full').style.display = 'none'; document.getElementById('2405.13745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13420">arXiv:2404.13420</a> <span> [<a href="https://arxiv.org/pdf/2404.13420">pdf</a>, <a href="https://arxiv.org/format/2404.13420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeurCADRecon: Neural Representation for Reconstructing CAD Surfaces by Enforcing Zero Gaussian Curvature </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiujie Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Rui Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengfei Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuangmin Chen</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+S">Shiqing Xin</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaohong Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+C">Changhe Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13420v1-abstract-short" style="display: inline;"> Despite recent advances in reconstructing an organic model with the neural signed distance function (SDF), the high-fidelity reconstruction of a CAD model directly from low-quality unoriented point clouds remains a significant challenge. In this paper, we address this challenge based on the prior observation that the surface of a CAD model is generally composed of piecewise surface patches, each a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13420v1-abstract-full').style.display = 'inline'; document.getElementById('2404.13420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13420v1-abstract-full" style="display: none;"> Despite recent advances in reconstructing an organic model with the neural signed distance function (SDF), the high-fidelity reconstruction of a CAD model directly from low-quality unoriented point clouds remains a significant challenge. In this paper, we address this challenge based on the prior observation that the surface of a CAD model is generally composed of piecewise surface patches, each approximately developable even around the feature line. Our approach, named NeurCADRecon, is self-supervised, and its loss includes a developability term to encourage the Gaussian curvature toward 0 while ensuring fidelity to the input points. Noticing that the Gaussian curvature is non-zero at tip points, we introduce a double-trough curve to tolerate the existence of these tip points. Furthermore, we develop a dynamic sampling strategy to deal with situations where the given points are incomplete or too sparse. Since our resulting neural SDFs can clearly manifest sharp feature points/lines, one can easily extract the feature-aligned triangle mesh from the SDF and then decompose it into smooth surface patches, greatly reducing the difficulty of recovering the parametric CAD design. 
A comprehensive comparison with existing state-of-the-art methods shows the significant advantage of our approach in reconstructing faithful CAD shapes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13420v1-abstract-full').style.display = 'none'; document.getElementById('2404.13420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM Transactions on Graphics (SIGGRAPH 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04808">arXiv:2404.04808</a> <span> [<a href="https://arxiv.org/pdf/2404.04808">pdf</a>, <a href="https://arxiv.org/format/2404.04808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MemFlow: Optical Flow Estimation and Prediction with Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Q">Qiaole Dong</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04808v1-abstract-short" style="display: inline;"> Optical flow is a classical task that is important to the vision community. Classical optical flow estimation uses two frames as input, whilst some recent methods consider multiple frames to explicitly model long-range information. The former ones limit their ability to fully leverage temporal coherence along the video sequence; and the latter ones incur heavy computational overhead, typically not… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04808v1-abstract-full').style.display = 'inline'; document.getElementById('2404.04808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04808v1-abstract-full" style="display: none;"> Optical flow is a classical task that is important to the vision community. Classical optical flow estimation uses two frames as input, whilst some recent methods consider multiple frames to explicitly model long-range information. The former ones limit their ability to fully leverage temporal coherence along the video sequence; and the latter ones incur heavy computational overhead, typically not possible for real-time flow estimation. Some multi-frame-based approaches even necessitate unseen future frames for current estimation, compromising real-time applicability in safety-critical scenarios. To this end, we present MemFlow, a real-time method for optical flow estimation and prediction with memory. Our method enables memory read-out and update modules for aggregating historical motion information in real-time. Furthermore, we integrate resolution-adaptive re-scaling to accommodate diverse video resolutions. 
arXiv:2404.04808 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
MemFlow: Optical Flow Estimation and Prediction with Memory
Authors: Qiaole Dong, Yanwei Fu
Abstract: Optical flow is a classical task that is important to the vision community. Classical optical flow estimation uses two frames as input, whilst some recent methods consider multiple frames to explicitly model long-range information. The former limits the ability to fully leverage temporal coherence along the video sequence, while the latter incurs heavy computational overhead that typically rules out real-time flow estimation. Some multi-frame-based approaches even require unseen future frames for the current estimate, compromising real-time applicability in safety-critical scenarios. To this end, we present MemFlow, a real-time method for optical flow estimation and prediction with memory. Our method features memory read-out and update modules for aggregating historical motion information in real time. Furthermore, we integrate resolution-adaptive re-scaling to accommodate diverse video resolutions. Besides, our approach seamlessly extends to future prediction of optical flow based on past observations. Leveraging effective historical motion aggregation, our method outperforms VideoFlow with fewer parameters and faster inference speed on the Sintel and KITTI-15 datasets in terms of generalization performance. At the time of submission, MemFlow also leads in performance on the 1080p Spring dataset. Codes and models will be available at: https://dqiaole.github.io/MemFlow/.
Submitted 7 April, 2024; originally announced April 2024.
Comments: CVPR 2024
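As a rough illustration of the memory read-out and update idea, here is a toy attention-based buffer in PyTorch. The fixed capacity, tensor shapes, and scaled dot-product read-out are assumptions for illustration, not MemFlow's actual architecture.

```python
import torch
import torch.nn.functional as F

class MotionMemory:
    """Toy read-out/update memory over historical motion features."""
    def __init__(self, dim: int, capacity: int = 8):
        self.dim, self.capacity = dim, capacity
        self.slots = []  # list of (N, dim) feature tensors from past frames

    def update(self, motion_feat: torch.Tensor) -> None:
        self.slots.append(motion_feat.detach())
        if len(self.slots) > self.capacity:   # bounded memory keeps cost real-time
            self.slots.pop(0)

    def read(self, query: torch.Tensor) -> torch.Tensor:
        """Aggregate history by attending from current features (N, dim)."""
        if not self.slots:
            return torch.zeros_like(query)
        keys = torch.cat(self.slots, dim=0)                       # (M, dim)
        attn = F.softmax(query @ keys.T / self.dim ** 0.5, dim=-1)
        return attn @ keys                                        # (N, dim)
```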
arXiv:2403.18435 [pdf, other] cs.IR (Information Retrieval); cs.CL (Computation and Language)
DELTA: Pre-train a Discriminative Encoder for Legal Case Retrieval via Structural Word Alignment
Authors: Haitao Li, Qingyao Ai, Xinyan Han, Jia Chen, Qian Dong, Yiqun Liu, Chong Chen, Qi Tian
Abstract: Recent research demonstrates the effectiveness of using pre-trained language models for legal case retrieval. Most of the existing works focus on improving the representation ability of the contextualized embedding of the [CLS] token and calculate relevance using textual semantic similarity. However, in the legal domain, textual semantic similarity does not always imply that the cases are relevant enough. Instead, relevance in legal cases primarily depends on the similarity of the key facts that impact the final judgment. Without proper treatment, the discriminative ability of learned representations could be limited since legal cases are lengthy and contain numerous non-key facts. To this end, we introduce DELTA, a discriminative model designed for legal case retrieval. The basic idea involves pinpointing key facts in legal cases and pulling the contextualized embedding of the [CLS] token closer to the key facts while pushing it away from the non-key facts, which can warm up the case embedding space in an unsupervised manner. Specifically, this study brings the word alignment mechanism to the contextual masked auto-encoder. First, we leverage shallow decoders to create information bottlenecks, aiming to enhance representation ability. Second, we employ the deep decoder to enable translation between different structures, with the goal of pinpointing key facts to enhance discriminative ability. Comprehensive experiments conducted on publicly available legal benchmarks show that our approach can outperform existing state-of-the-art methods in legal case retrieval. It provides a new perspective on the in-depth understanding and processing of legal case documents.
Submitted 27 March, 2024; originally announced March 2024.
Comments: 11 pages
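The pull/push objective described above can be sketched as a small contrastive loss. The InfoNCE-style form, the temperature `tau`, and the tensor shapes below are assumptions; DELTA's exact objective may differ.

```python
import torch
import torch.nn.functional as F

def structural_contrastive_loss(cls_emb: torch.Tensor,
                                key_fact_emb: torch.Tensor,
                                non_key_emb: torch.Tensor,
                                tau: float = 0.05) -> torch.Tensor:
    """Pull the [CLS] embedding toward key-fact embeddings and push it away
    from non-key-fact embeddings. All inputs are (B, d)."""
    cls_emb = F.normalize(cls_emb, dim=-1)
    pos = F.normalize(key_fact_emb, dim=-1)
    neg = F.normalize(non_key_emb, dim=-1)
    pos_sim = (cls_emb * pos).sum(-1) / tau          # (B,)
    neg_sim = (cls_emb * neg).sum(-1) / tau          # (B,)
    logits = torch.stack([pos_sim, neg_sim], dim=1)  # (B, 2); column 0 = positive
    labels = torch.zeros(len(logits), dtype=torch.long)
    return F.cross_entropy(logits, labels)
```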
arXiv:2403.18365 [pdf, other] cs.CL (Computation and Language)
BLADE: Enhancing Black-box Large Language Models with Small Domain-Specific Models
Authors: Haitao Li, Qingyao Ai, Jia Chen, Qian Dong, Zhijing Wu, Yiqun Liu, Chong Chen, Qi Tian
Abstract: Large Language Models (LLMs) like ChatGPT and GPT-4 are versatile and capable of addressing a diverse range of tasks. However, general LLMs, which are developed on open-domain data, may lack the domain-specific knowledge essential for tasks in vertical domains, such as legal, medical, etc. To address this issue, previous approaches either conduct continuous pre-training with domain-specific data or employ retrieval augmentation to support general LLMs. Unfortunately, these strategies are either cost-intensive or unreliable in practical applications. To this end, we present a novel framework named BLADE, which enhances Black-box LArge language models with small Domain-spEcific models. BLADE consists of a black-box LLM and a small domain-specific LM. The small LM preserves domain-specific knowledge and offers specialized insights, while the general LLM contributes robust language comprehension and reasoning capabilities. Specifically, our method involves three steps: 1) pre-training the small LM with domain-specific data, 2) fine-tuning this model using knowledge instruction data, and 3) joint Bayesian optimization of the general LLM and the small LM. Extensive experiments conducted on public legal and medical benchmarks reveal that BLADE significantly outperforms existing approaches. This shows the potential of BLADE as an effective and cost-efficient solution in adapting general LLMs for vertical domains.
Submitted 27 March, 2024; originally announced March 2024.
Comments: 11 pages
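A minimal sketch of the inference-time division of labor, assuming hypothetical `small_lm` and `black_box_llm` callables that map a prompt string to text; the joint Bayesian optimization step from the paper is deliberately omitted here.

```python
def blade_answer(question: str, small_lm, black_box_llm) -> str:
    """The small domain LM drafts specialized knowledge; the general
    black-box LLM reasons over it. Prompts are illustrative assumptions."""
    domain_hint = small_lm(f"Provide domain knowledge relevant to: {question}")
    prompt = (
        "Background knowledge (from a domain-specific model):\n"
        f"{domain_hint}\n\n"
        f"Question: {question}\n"
        "Answer, using the background knowledge where relevant."
    )
    return black_box_llm(prompt)
```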
arXiv:2403.02249 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Non-autoregressive Sequence-to-Sequence Vision-Language Models
Authors: Kunyu Shi, Qi Dong, Luis Goncalves, Zhuowen Tu, Stefano Soatto
Abstract: Sequence-to-sequence vision-language models are showing promise, but their applicability is limited by their inference latency due to their autoregressive way of generating predictions. We propose a parallel decoding sequence-to-sequence vision-language model, trained with a Query-CTC loss, that marginalizes over multiple inference paths in the decoder. This allows us to model the joint distribution of tokens, rather than being restricted to the conditional distributions of an autoregressive model. The resulting model, NARVL, achieves performance on par with its state-of-the-art autoregressive counterpart but is faster at inference time, replacing the linear complexity of sequential token generation with constant-time joint inference.
Submitted 12 March, 2025; v1 submitted 4 March, 2024; originally announced March 2024.
Comments: Accepted to CVPR 2024
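Non-autoregressive decoding with a CTC-style loss implies the standard collapse rule at inference: merge repeated tokens, then drop blanks. A small sketch, with the parallel decoder interface left hypothetical:

```python
import torch

def ctc_collapse(token_ids: torch.Tensor, blank_id: int = 0) -> list:
    """Standard CTC decoding rule over a 1-D tensor of per-position argmax
    predictions: merge consecutive repeats, then remove blanks."""
    out, prev = [], None
    for t in token_ids.tolist():
        if t != prev and t != blank_id:
            out.append(t)
        prev = t
    return out

# Hypothetical usage: `decoder` maps learnable queries plus image features to
# per-position logits in a single forward pass (no left-to-right dependency).
#   logits = decoder(queries, image_feats)   # (T, vocab)
#   tokens = ctc_collapse(logits.argmax(-1))
```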
arXiv:2403.01954 [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LO (Logic in Computer Science)
DECIDER: A Dual-System Rule-Controllable Decoding Framework for Language Generation
Authors: Chen Xu, Tian Lan, Changlong Yu, Wei Wang, Jun Gao, Yu Ji, Qunxi Dong, Kun Qian, Piji Li, Wei Bi, Bin Hu
Abstract: Constrained decoding approaches aim to control the meaning or style of text generated by a Pre-trained Language Model (PLM) using specific target words during inference. However, these methods often guide plausible continuations by greedily selecting targets, which, while completing the task, may disrupt the natural patterns of human language generation. In this work, we propose a novel decoding framework, DECIDER, which enables us to program rules on how we complete tasks to control a PLM. Differing from previous work, our framework transforms the encouragement of target words into the encouragement of all words that satisfy the rule. Specifically, DECIDER is a dual system where a PLM is equipped with a First-Order Logic (FOL) reasoner to express and evaluate the rules, and a decision function to merge the outputs from both systems to steer the generation. Experiments on CommonGen and PersonaChat demonstrate that DECIDER can effectively follow given rules to achieve generation tasks in a more human-like manner.
Submitted 7 July, 2024; v1 submitted 4 March, 2024; originally announced March 2024.
Comments: Submitted to IEEE TKDE (Major Revision), 13 pages, 6 figures
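One way to picture the decision function is as a re-weighting of the PLM's next-token distribution by per-word rule-satisfaction scores, so that every word satisfying the rule is encouraged, not just a fixed target list. The multiplicative merge below is an illustrative assumption, not the paper's exact formula.

```python
import torch

def decider_step(plm_logits: torch.Tensor,
                 rule_scores: torch.Tensor,
                 alpha: float = 2.0) -> torch.Tensor:
    """Merge the two systems for one decoding step.

    plm_logits:  (vocab,) raw next-token logits from the language model
    rule_scores: (vocab,) scores in [0, 1]; 1 = word satisfies the FOL rule
    """
    p_plm = torch.softmax(plm_logits, dim=-1)
    merged = p_plm * (1.0 + alpha * rule_scores)  # boost rule-satisfying words
    return merged / merged.sum()                  # renormalize to a distribution
```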
arXiv:2402.17759 [pdf, other] cs.CL (Computation and Language)
Towards Optimal Learning of Language Models
Authors: Yuxian Gu, Li Dong, Yaru Hao, Qingxiu Dong, Minlie Huang, Furu Wei
Abstract: This work studies the general principles of improving the learning of language models (LMs), with the aim of reducing the number of training steps needed to reach superior performance. Specifically, we present a theory for the optimal learning of LMs. We first propose an objective that optimizes LM learning by maximizing the data compression ratio in an "LM-training-as-lossless-compression" view. Then, we derive a theorem, named Learning Law, to reveal the properties of the dynamics in the optimal learning process under our objective. The theorem is then validated by experiments on a linear classification task and a real-world language modeling task. Finally, we empirically verify that the optimal learning of LMs essentially stems from the improvement of the coefficients in the scaling law of LMs, indicating great promise and significance for designing practical learning acceleration methods. Our code can be found at https://aka.ms/LearningLaw.
Submitted 3 March, 2024; v1 submitted 27 February, 2024; originally announced February 2024.
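The "LM-training-as-lossless-compression" view has a simple numeric core: streaming the training data through the model costs -log2 p bits per token under arithmetic coding, and the objective favors a large ratio of raw bits to these code bits. A sketch under that reading (the paper's exact formulation may differ):

```python
import math

def compression_ratio(nll_bits_per_step: list[float],
                      tokens_per_step: list[int],
                      vocab_size: int) -> float:
    """Raw bits of the data stream divided by the model's total codelength.

    nll_bits_per_step: mean negative log-likelihood (in bits/token) at each step
    tokens_per_step:   number of tokens consumed at each step
    """
    code_bits = sum(nll * n for nll, n in zip(nll_bits_per_step, tokens_per_step))
    raw_bits = sum(tokens_per_step) * math.log2(vocab_size)  # naive encoding cost
    return raw_bits / code_bits  # larger = better compression of training data
```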
arXiv:2402.16141 [pdf, other] cs.CL (Computation and Language)
PeriodicLoRA: Breaking the Low-Rank Bottleneck in LoRA Optimization
Authors: Xiangdi Meng, Damai Dai, Weiyao Luo, Zhe Yang, Shaoxiang Wu, Xiaochen Wang, Peiyi Wang, Qingxiu Dong, Liang Chen, Zhifang Sui
Abstract: Supervised fine-tuning is the most common method to adapt large language models (LLMs) to downstream tasks, but fully fine-tuning LLMs requires massive computational resources. Recently, parameter-efficient fine-tuning (PEFT) methods have been widely studied due to their cost-effectiveness. LoRA is one of the most widely used methods and assumes that the optimization process is essentially low-dimensional. Although LoRA fine-tuning is effective, a performance gap remains compared to full fine-tuning, since its weight updates are limited to low-rank matrices. To break the low-rank bottleneck in LoRA optimization, we propose PeriodicLoRA (PLoRA), which accumulates low-rank update matrices multiple times to achieve a higher update rank. PLoRA has multiple training stages. During each stage, we still update only the LoRA weights. However, at the end of each stage, we unload the LoRA weights into the backbone parameters and then reinitialize the LoRA states. Experimental results show that PLoRA has stronger learning ability, up to approximately 1.8 times that of LoRA, without increasing memory usage. Further, we introduce a momentum-based unloading strategy for PLoRA to mitigate training instability.
Submitted 25 February, 2024; originally announced February 2024.
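The unload step is mechanical enough to sketch directly: fold the low-rank product into the backbone weight, then reset the factors so the next stage starts from an unchanged model but learns a fresh low-rank direction. Summed over stages, the total update can exceed the rank of any single stage. The initialization scheme below follows common LoRA convention and is an assumption.

```python
import torch

@torch.no_grad()
def unload_lora(W: torch.Tensor, A: torch.Tensor, B: torch.Tensor,
                scale: float = 1.0) -> None:
    """End-of-stage unload: merge the accumulated low-rank update into the
    backbone, then reinitialize the LoRA factors for the next stage.

    W: (out, in) backbone weight;  B: (out, r);  A: (r, in)
    """
    W += scale * (B @ A)                 # fold the low-rank update into W
    torch.nn.init.normal_(A, std=0.02)   # fresh A (assumed LoRA-style init)
    B.zero_()                            # B = 0, so the merged model is unchanged
```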
arXiv:2402.13064 [pdf, other] cs.CL (Computation and Language)
Synthetic Data (Almost) from Scratch: Generalized Instruction Tuning for Language Models
Authors: Haoran Li, Qingxiu Dong, Zhengyang Tang, Chaojun Wang, Xingxing Zhang, Haoyang Huang, Shaohan Huang, Xiaolong Huang, Zeqiang Huang, Dongdong Zhang, Yuxian Gu, Xin Cheng, Xun Wang, Si-Qing Chen, Li Dong, Wei Lu, Zhifang Sui, Benyou Wang, Wai Lam, Furu Wei
Abstract: We introduce Generalized Instruction Tuning (called GLAN), a general and scalable method for instruction tuning of Large Language Models (LLMs). Unlike prior work that relies on seed examples or existing datasets to construct instruction tuning data, GLAN exclusively utilizes a pre-curated taxonomy of human knowledge and capabilities as input and generates large-scale synthetic instruction data across all disciplines. Specifically, inspired by the systematic structure of the human education system, we build the taxonomy by semi-automatically decomposing human knowledge and capabilities into fields, sub-fields, and ultimately distinct disciplines, facilitated by LLMs. Subsequently, we generate a comprehensive list of subjects for every discipline and proceed to design a syllabus tailored to each subject, again utilizing LLMs. With the fine-grained key concepts detailed in every class session of the syllabus, we are able to generate diverse instructions with broad coverage across the entire spectrum of human knowledge and skills. Extensive experiments on large language models (e.g., Mistral) demonstrate that GLAN excels in multiple dimensions, from mathematical reasoning, coding, academic exams, and logical reasoning to general instruction following, without using task-specific training data for these tasks. In addition, GLAN allows for easy customization: new fields or skills can be added by simply incorporating a new node into our taxonomy.
Submitted 20 February, 2024; originally announced February 2024.
Comments: Work in progress
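The taxonomy-to-instructions pipeline can be pictured as nested generation loops around a hypothetical `llm(prompt)` callable; the prompts and the `n_questions` knob are illustrative, not the paper's.

```python
def glan_generate(taxonomy: list[str], llm, n_questions: int = 3) -> list[str]:
    """Toy sketch: taxonomy -> subjects -> syllabus sessions -> instructions."""
    data = []
    for discipline in taxonomy:
        subjects = llm(f"List subjects taught in {discipline}.").splitlines()
        for subject in subjects:
            syllabus = llm(f"Design a syllabus for '{subject}': list class "
                           "sessions with their key concepts.").splitlines()
            for session in syllabus:
                for _ in range(n_questions):
                    data.append(llm("Write a homework question testing the key "
                                    f"concepts of this session: {session}"))
    return data
```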
arXiv:2402.11281 [pdf, other] cs.CL (Computation and Language)
Can Large Multimodal Models Uncover Deep Semantics Behind Images?
Authors: Yixin Yang, Zheng Li, Qingxiu Dong, Heming Xia, Zhifang Sui
Abstract: Understanding the deep semantics of images is essential in the era dominated by social media. However, current research focuses primarily on superficial descriptions of images, revealing a notable deficiency in the systematic investigation of inherent deep semantics. In this work, we introduce DEEPEVAL, a comprehensive benchmark to assess Large Multimodal Models' (LMMs) capacities for visual deep semantics. DEEPEVAL includes a human-annotated dataset and three progressive subtasks: fine-grained description selection, in-depth title matching, and deep semantics understanding. Using DEEPEVAL, we evaluate 9 open-source LMMs and GPT-4V(ision). Our evaluation demonstrates a substantial gap between the deep semantic comprehension capabilities of existing LMMs and humans. For example, GPT-4V is 30% behind humans in understanding deep semantics, even though it achieves human-comparable performance in image description. Further analysis reveals that LMM performance on DEEPEVAL varies according to the specific facets of deep semantics explored, indicating the fundamental challenges remaining in developing LMMs.
Submitted 20 June, 2024; v1 submitted 17 February, 2024; originally announced February 2024.
arXiv:2401.16861 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Repositioning the Subject within Image
Authors: Yikai Wang, Chenjie Cao, Ke Fan, Qiaole Dong, Yifan Li, Xiangyang Xue, Yanwei Fu
Abstract: Current image manipulation primarily centers on static manipulation, such as replacing specific regions within an image or altering its overall style. In this paper, we introduce an innovative dynamic manipulation task, subject repositioning. This task involves relocating a user-specified subject to a desired position while preserving the image's fidelity. Our research reveals that the fundamental sub-tasks of subject repositioning, which include filling the void left by the repositioned subject, reconstructing obscured portions of the subject, and blending the subject to be consistent with surrounding areas, can be effectively reformulated as a unified, prompt-guided inpainting task. Consequently, we can employ a single diffusion generative model to address these sub-tasks using various task prompts learned through our proposed task inversion technique. Additionally, we integrate pre-processing and post-processing techniques to further enhance the quality of subject repositioning. These elements together form our SEgment-gEnerate-and-bLEnd (SEELE) framework. To assess SEELE's effectiveness in subject repositioning, we assemble a real-world subject repositioning dataset called ReS. Results of SEELE on ReS demonstrate its efficacy. Code and ReS dataset are available at https://yikai-wang.github.io/seele/.
Submitted 18 November, 2024; v1 submitted 30 January, 2024; originally announced January 2024.
Comments: Accepted by TMLR. Arxiv version uses small size images. Full size PDF, code, and dataset are available at https://yikai-wang.github.io/seele/
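The unified formulation suggests a sketch in which all three sub-tasks call one inpainting model with different learned prompts. Everything here is a stand-in: `inpaint(img, mask, prompt)` is a hypothetical diffusion-model interface, the prompt keys are invented, and the cut-and-paste ignores the wrap-around effects of np.roll.

```python
import numpy as np

def reposition_subject(image: np.ndarray, mask: np.ndarray,
                       shift: tuple[int, int], inpaint, prompts: dict):
    """One inpainting model, three learned task prompts (per the unified view).

    image: (H, W, 3);  mask: (H, W) boolean subject mask;  shift: (dy, dx)
    """
    dy, dx = shift
    new_mask = np.roll(mask, (dy, dx), axis=(0, 1))  # subject's new footprint
    moved = image.copy()
    moved[new_mask] = image[mask]                    # naive cut-and-paste
    img = inpaint(moved, mask, prompts["fill_void"])          # fill vacated region
    img = inpaint(img, new_mask, prompts["complete_subject"]) # restore occlusions
    return inpaint(img, new_mask, prompts["blend"])           # harmonize borders
```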
arXiv:2401.07851 [pdf, other] cs.CL (Computation and Language)
Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding
Authors: Heming Xia, Zhe Yang, Qingxiu Dong, Peiyi Wang, Yongqi Li, Tao Ge, Tianyu Liu, Wenjie Li, Zhifang Sui
Abstract: To mitigate the high inference latency stemming from autoregressive decoding in Large Language Models (LLMs), Speculative Decoding has emerged as a novel decoding paradigm for LLM inference. In each decoding step, this method first drafts several future tokens efficiently and then verifies them in parallel. Unlike autoregressive decoding, Speculative Decoding facilitates the simultaneous decoding of multiple tokens per step, thereby accelerating inference. This paper presents a comprehensive overview and analysis of this promising decoding paradigm. We begin by providing a formal definition and formulation of Speculative Decoding. Then, we organize in-depth discussions on its key facets, such as drafter selection and verification strategies. Furthermore, we present a comparative analysis of leading methods under third-party testing environments. We aim for this work to serve as a catalyst for further research on Speculative Decoding, ultimately contributing to more efficient LLM inference.
Submitted 4 June, 2024; v1 submitted 15 January, 2024; originally announced January 2024.
Comments: ACL 2024 Findings (Long Paper), camera-ready version
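The draft-then-verify loop is easy to make concrete in its greedy variant: a small model proposes k tokens sequentially, the large model scores all of them in one parallel pass, and the longest agreeing prefix is kept, plus the target's correction at the first mismatch. The two-model interface below (a 1-D token tensor in, per-position logits out) is an assumption for illustration.

```python
import torch

@torch.no_grad()
def speculative_step(draft_lm, target_lm, prefix: torch.Tensor,
                     k: int = 4) -> torch.Tensor:
    """One greedy draft-then-verify step; several tokens can be accepted per
    target-model call, which is where the speed-up comes from."""
    tokens = prefix
    for _ in range(k):                              # cheap sequential drafting
        nxt = draft_lm(tokens)[-1].argmax()
        tokens = torch.cat([tokens, nxt.view(1)])
    logits = target_lm(tokens)                      # one parallel verification
    n = len(prefix)
    for i in range(k):                              # accept while target agrees
        expected = logits[n - 1 + i].argmax()       # target's prediction here
        if tokens[n + i] != expected:
            return torch.cat([tokens[: n + i], expected.view(1)])
    return tokens
```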