
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 847 results for author: <span class="mathjax">Sun, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Sun%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Sun, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Sun%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Sun, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14279">arXiv:2411.14279</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14279">pdf</a>, <a href="https://arxiv.org/format/2411.14279">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Looking Beyond Text: Reducing Language bias in Large Vision-Language Models via Multimodal Dual-Attention and Soft-Image Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Haozhe Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Si%2C+S">Shuzheng Si</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mingjia Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+B">Baobao Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14279v1-abstract-short" style="display: inline;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. 
However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14279v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14279v1-abstract-full" style="display: none;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage of LLM and multimodal alignment stage. 2. The learned inference bias due to short-term dependency of text data. Therefore, we propose LACING, a systemic framework designed to address the language bias of LVLMs with muLtimodal duAl-attention meChanIsm (MDA) aNd soft-image Guidance (IFG). Specifically, MDA introduces a parallel dual-attention mechanism that enhances the integration of visual inputs across the model. IFG introduces a learnable soft visual prompt during training and inference to replace visual inputs, designed to compel LVLMs to prioritize text inputs. Then, IFG further proposes a novel decoding strategy using the soft visual prompt to mitigate the model&#39;s over-reliance on adjacent text inputs. Comprehensive experiments demonstrate that our method effectively debiases LVLMs from their language bias, enhancing visual comprehension and reducing hallucinations without requiring additional training resources or data. The code and model are available at [lacing-lvlm.github.io](https://lacing-lvlm.github.io). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'none'; document.getElementById('2411.14279v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
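The core mechanism named in this entry is the parallel dual-attention. As one possible, purely illustrative reading of that idea (the paper's actual design may differ), a second attention branch that can only look at visual tokens could run alongside standard attention, with the two outputs summed; the shapes, masking, and merge rule below are all assumptions:

```python
# Hypothetical sketch of a "parallel dual-attention" block: a full-sequence
# attention pass plus a visual-only attention pass, merged by summation.
import torch
import torch.nn.functional as F

def dual_attention(q, k, v, visual_mask):
    """q, k, v: (batch, heads, seq, head_dim); visual_mask: (seq,) bool, True at image tokens."""
    full_out = F.scaled_dot_product_attention(q, k, v)            # ordinary attention
    # Second branch: every query may only attend to visual tokens,
    # increasing how much image content reaches each position.
    mask = visual_mask[None, None, None, :].expand(q.size(0), 1, q.size(2), -1)
    visual_out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
    return full_out + visual_out                                  # assumed merge rule
```
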
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12776">arXiv:2411.12776</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12776">pdf</a>, <a href="https://arxiv.org/format/2411.12776">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Cross-Layer Encrypted Semantic Communication Framework for Panoramic Video Transmission </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Haixiao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengying Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xiaodong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Bingxuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+S">Shujun Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bizhu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+S">Sheng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+C">Chen Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12776v1-abstract-short" style="display: inline;"> In this paper, we propose a cross-layer encrypted semantic communication (CLESC) framework for panoramic video transmission, incorporating feature extraction, encoding, encryption, cyclic redundancy check (CRC), and retransmission processes to achieve compatibility between semantic communication and traditional communication systems. Additionally, we propose an adaptive cross-layer transmission me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12776v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12776v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12776v1-abstract-full" style="display: none;"> In this paper, we propose a cross-layer encrypted semantic communication (CLESC) framework for panoramic video transmission, incorporating feature extraction, encoding, encryption, cyclic redundancy check (CRC), and retransmission processes to achieve compatibility between semantic communication and traditional communication systems. Additionally, we propose an adaptive cross-layer transmission mechanism that dynamically adjusts CRC, channel coding, and retransmission schemes based on the importance of semantic information. This ensures that important information is prioritized under poor transmission conditions. 
To verify the aforementioned framework, we also design an end-to-end adaptive panoramic video semantic transmission (APVST) network that leverages a deep joint source-channel coding (Deep JSCC) structure and attention mechanism, integrated with a latitude adaptive module that facilitates adaptive semantic feature extraction and variable-length encoding of panoramic videos. The proposed CLESC is also applicable to the transmission of other modal data. Simulation results demonstrate that the proposed CLESC effectively achieves compatibility and adaptation between semantic communication and traditional communication systems, improving both transmission efficiency and channel adaptability. Compared to traditional cross-layer transmission schemes, the CLESC framework can reduce bandwidth consumption by 85% while showing significant advantages under low signal-to-noise ratio (SNR) conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12776v1-abstract-full').style.display = 'none'; document.getElementById('2411.12776v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11848">arXiv:2411.11848</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11848">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistical Finance">q-fin.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Robust Graph Neural Networks for Stability Analysis in Dynamic Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengfang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+T">Tong Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Wenying Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11848v1-abstract-short" style="display: inline;"> In the current context of accelerated globalization and digitalization, the complexity and uncertainty of financial markets are increasing, and the identification and prevention of economic risks have become a key link in maintaining the stability of the financial system. 
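The adaptive cross-layer mechanism described above amounts to mapping the importance of a semantic feature, together with the channel state, to link-layer parameters. A minimal sketch of that mapping, with invented thresholds and parameter values purely for illustration:

```python
# Toy importance-driven cross-layer adaptation: more important semantic features
# get a longer CRC, a lower-rate (more protective) channel code, and more
# retransmission attempts. All numbers are made up for illustration.
def select_link_params(importance: float, snr_db: float):
    """importance in [0, 1]; returns (crc_bits, code_rate, max_retransmissions)."""
    if importance > 0.7 or snr_db < 5.0:
        return 32, 1 / 3, 3   # protect critical features and poor channels heavily
    if importance > 0.4:
        return 16, 1 / 2, 2
    return 8, 3 / 4, 1        # spend little overhead on unimportant features
```
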
3. arXiv:2411.11848 [pdf]
Subjects: q-fin.ST (Statistical Finance); cs.LG (Machine Learning)
Title: Robust Graph Neural Networks for Stability Analysis in Dynamic Networks
Authors: Xin Zhang, Zhen Xu, Yue Liu, Mengfang Sun, Tong Zhou, Wenying Sun
Abstract: In the current context of accelerated globalization and digitalization, the complexity and uncertainty of financial markets are increasing, and the identification and prevention of economic risks have become a key link in maintaining the stability of the financial system. Traditional risk identification methods often have limitations because they struggle to cope with the multi-level and dynamically changing complex relationships in financial networks. With the rapid development of financial technology, graph neural network (GNN) technology, as an emerging deep learning method, has gradually shown great potential in the field of financial risk management. GNNs can map transaction behaviors, financial institutions, individuals, and their interactive relationships in financial networks into graph structures, and effectively capture potential patterns and abnormal signals in financial data through embedded representation learning. Using this technology, financial institutions can extract valuable information from complex transaction networks, identify hidden dangers or abnormal behaviors that may cause systemic risks in a timely manner, optimize decision-making processes, and improve the accuracy of risk warnings. This paper explores an economic risk identification algorithm based on GNNs, aiming to provide financial institutions and regulators with more intelligent technical tools to help maintain the security and stability of the financial market. Improving the efficiency of economic risk identification through innovative technical means is expected to further enhance the risk resistance of the financial system and lay the foundation for building a robust global financial system.
Submitted: 29 October, 2024; originally announced November 2024.
Comments: Accepted by the 3rd International Conference on Cloud Computing Big Data Application and Software Engineering

4. arXiv:2411.11843 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Bi-Mamba: Towards Accurate 1-Bit State Space Models
Authors: Shengkun Tang, Liqun Ma, Haonan Li, Mingjie Sun, Zhiqiang Shen
Abstract: The typical selective state-space model (SSM) of Mamba addresses several limitations of Transformers, such as quadratic computational complexity with sequence length and significant inference-time memory requirements due to the key-value cache. However, the growing size of Mamba models continues to pose training and deployment challenges and raises environmental concerns due to considerable energy consumption. In this work, we introduce Bi-Mamba, a scalable and powerful 1-bit Mamba architecture designed for more efficient large language models, at multiple sizes of 780M, 1.3B, and 2.7B parameters. Bi-Mamba models are trained from scratch on the same data volume as regular LLM pretraining, using an autoregressive distillation loss. Extensive experimental results on language modeling demonstrate that Bi-Mamba achieves performance comparable to its full-precision counterparts (e.g., FP16 or BF16) and much better accuracy than post-training-binarization (PTB) Mamba baselines, while significantly reducing memory footprint and energy consumption compared to the original Mamba model. Our study pioneers a new linear-computational-complexity LLM framework under low-bit representation and facilitates the future design of specialized hardware tailored for efficient 1-bit Mamba-based LLMs.
Submitted: 18 November, 2024; originally announced November 2024.

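For readers unfamiliar with 1-bit weights, the standard trick that binarized models of this kind build on is to replace each weight matrix by its sign times a per-row scale. Bi-Mamba's exact binarization and distillation recipe may differ, so treat this as background rather than the paper's method:

```python
# Standard 1-bit weight binarization: sign of the weights times a per-row
# scale (the mean absolute value), used in place of the full-precision matrix.
import torch

def binarize_weight(w: torch.Tensor) -> torch.Tensor:
    """w: (out_features, in_features) full-precision weight matrix."""
    scale = w.abs().mean(dim=1, keepdim=True)   # alpha_i = mean |w_i|
    w_sign = torch.sign(w)
    w_sign[w_sign == 0] = 1.0                   # avoid zero entries
    return scale * w_sign
```
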
5. arXiv:2411.10368 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); eess.IV (Image and Video Processing)
Title: Mechanisms of Generative Image-to-Image Translation Networks
Authors: Guangzong Chen, Mingui Sun, Zhi-Hong Mao, Kangni Liu, Wenyan Jia
Abstract: Generative Adversarial Networks (GANs) are a class of neural networks that have been widely used in the field of image-to-image translation. In this paper, we propose a streamlined image-to-image translation network with a simpler architecture compared to existing models. We investigate the relationship between GANs and autoencoders and provide an explanation for the efficacy of employing only the GAN component for tasks involving image translation. We show that adversarial training for GAN models yields results comparable to those of existing methods without additional complex loss penalties. Subsequently, we elucidate the rationale behind this phenomenon. We also incorporate experimental results to demonstrate the validity of our findings.
Submitted: 15 November, 2024; originally announced November 2024.

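The claim that the adversarial objective alone suffices can be stated concretely: train a paired image-to-image GAN with only the non-saturating GAN loss and no L1, perceptual, or cycle penalties. The sketch below assumes generic G(x) and D(x, y) modules and is illustrative, not the authors' training code:

```python
# One training step of a paired image-to-image GAN using only the adversarial
# loss (no reconstruction or perceptual penalties). G and D are assumed modules.
import torch
import torch.nn.functional as F

def gan_step(G, D, x, y, opt_g, opt_d):
    fake = G(x)
    # Discriminator update: real pairs -> 1, generated pairs -> 0.
    real_logit = D(x, y)
    fake_logit = D(x, fake.detach())
    d_loss = (F.binary_cross_entropy_with_logits(real_logit, torch.ones_like(real_logit))
              + F.binary_cross_entropy_with_logits(fake_logit, torch.zeros_like(fake_logit)))
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # Generator update: purely adversarial.
    g_logit = D(x, fake)
    g_loss = F.binary_cross_entropy_with_logits(g_logit, torch.ones_like(g_logit))
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()
    return d_loss.item(), g_loss.item()
```
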
6. arXiv:2411.09906 [pdf, other]
Subjects: cs.CR (Cryptography and Security); eess.SY (Systems and Control)
Title: A Survey of Machine Learning-based Physical-Layer Authentication in Wireless Communications
Authors: Rui Meng, Bingxuan Xu, Xiaodong Xu, Mengying Sun, Bizhu Wanga, Shujun Han, Suyu Lv, Ping Zhang
Abstract: To ensure secure and reliable communication in wireless systems, authenticating the identities of numerous nodes is imperative. Traditional cryptography-based authentication methods suffer from issues such as low compatibility, low reliability, and high complexity. Physical-Layer Authentication (PLA) is emerging as a promising complement due to its exploitation of unique properties in wireless environments. Recently, Machine Learning (ML)-based PLA has gained attention for its intelligence, adaptability, universality, and scalability compared to non-ML approaches. However, a comprehensive overview of state-of-the-art ML-based PLA and its foundational aspects is lacking. This paper presents a comprehensive survey of characteristics and technologies that can be used in ML-based PLA. We categorize existing ML-based PLA schemes into two main types: multi-device identification and attack detection schemes. In deep learning-based multi-device identification schemes, Deep Neural Networks are employed to train models, avoiding complex processing and expert feature transformation. Deep learning-based multi-device identification schemes are further subdivided, with schemes based on Convolutional Neural Networks being extensively researched. In ML-based attack detection schemes, receivers utilize intelligent ML techniques to set detection thresholds automatically, eliminating the need for manual calculation or knowledge of channel models. ML-based attack detection schemes are categorized into three sub-types: Supervised Learning, Unsupervised Learning, and Reinforcement Learning. Additionally, we summarize open-source datasets used for PLA, encompassing Radio Frequency fingerprints and channel fingerprints. Finally, this paper outlines future research directions to guide researchers in related fields.
Submitted: 14 November, 2024; originally announced November 2024.
Comments: 111 pages, 9 figures

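As background for the point that ML can set detection thresholds automatically, one common unsupervised recipe is to model legitimate channel fingerprints statistically and take a low percentile of their likelihood as the threshold. A self-contained toy version, with synthetic fingerprints standing in for real measurements:

```python
# Toy attack detector: fit a diagonal Gaussian to legitimate channel
# fingerprints and flag frames whose log-likelihood falls below a threshold
# learned from the training data itself (1st percentile here).
import numpy as np

rng = np.random.default_rng(0)
legit = rng.normal(0.0, 1.0, size=(1000, 8))           # synthetic fingerprints
mu, sigma = legit.mean(axis=0), legit.std(axis=0) + 1e-6

def log_likelihood(x: np.ndarray) -> np.ndarray:
    return -0.5 * (((x - mu) / sigma) ** 2 + np.log(2 * np.pi * sigma ** 2)).sum(axis=-1)

threshold = np.percentile(log_likelihood(legit), 1)     # learned, not hand-tuned

def is_attack(fingerprint: np.ndarray) -> bool:
    return bool(log_likelihood(fingerprint[None, :])[0] < threshold)
```
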
href="/search/cs?searchtype=author&amp;query=de+Guia%2C+S">Sheila de Guia</a>, <a href="/search/cs?searchtype=author&amp;query=McPike%2C+J">Jamie McPike</a>, <a href="/search/cs?searchtype=author&amp;query=Boulanger%2C+A">Adam Boulanger</a>, <a href="/search/cs?searchtype=author&amp;query=Shekel%2C+T">Tomer Shekel</a>, <a href="/search/cs?searchtype=author&amp;query=Schottlander%2C+D">David Schottlander</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Manukonda%2C+M+C">Manjit Chakravarthy Manukonda</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Bulut%2C+N">Neslihan Bulut</a>, <a href="/search/cs?searchtype=author&amp;query=Abu-el-haija%2C+S">Sami Abu-el-haija</a>, <a href="/search/cs?searchtype=author&amp;query=Eigenwillig%2C+A">Arno Eigenwillig</a>, <a href="/search/cs?searchtype=author&amp;query=Kothari%2C+P">Parth Kothari</a>, <a href="/search/cs?searchtype=author&amp;query=Perozzi%2C+B">Bryan Perozzi</a>, <a href="/search/cs?searchtype=author&amp;query=Bharel%2C+M">Monica Bharel</a> , et al. (9 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07207v2-abstract-short" style="display: inline;"> Supporting the health and well-being of dynamic populations around the world requires governmental agencies, organizations and researchers to understand and reason over complex relationships between human behavior and local contexts in order to identify high-risk groups and strategically allocate limited resources. Traditional approaches to these classes of problems often entail developing manuall&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07207v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07207v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07207v2-abstract-full" style="display: none;"> Supporting the health and well-being of dynamic populations around the world requires governmental agencies, organizations and researchers to understand and reason over complex relationships between human behavior and local contexts in order to identify high-risk groups and strategically allocate limited resources. Traditional approaches to these classes of problems often entail developing manually curated, task-specific features and models to represent human behavior and the natural and built environment, which can be challenging to adapt to new, or even, related tasks. To address this, we introduce a Population Dynamics Foundation Model (PDFM) that aims to capture the relationships between diverse data modalities and is applicable to a broad range of geospatial tasks. We first construct a geo-indexed dataset for postal codes and counties across the United States, capturing rich aggregated information on human behavior from maps, busyness, and aggregated search trends, and environmental factors such as weather and air quality. We then model this data and the complex relationships between locations using a graph neural network, producing embeddings that can be adapted to a wide range of downstream tasks using relatively simple models. 
We evaluate the effectiveness of our approach by benchmarking it on 27 downstream tasks spanning three distinct domains: health indicators, socioeconomic factors, and environmental measurements. The approach achieves state-of-the-art performance on all 27 geospatial interpolation tasks, and on 25 out of the 27 extrapolation and super-resolution tasks. We combined the PDFM with a state-of-the-art forecasting foundation model, TimesFM, to predict unemployment and poverty, achieving performance that surpasses fully supervised forecasting. The full set of embeddings and sample code are publicly available for researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07207v2-abstract-full').style.display = 'none'; document.getElementById('2411.07207v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 16 figures, preprint; v2: updated github url</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06525">arXiv:2411.06525</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06525">pdf</a>, <a href="https://arxiv.org/format/2411.06525">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> I2VControl-Camera: Precise Video Camera Control with Adjustable Motion Strength </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+W">Wanquan Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiawei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+P">Pengqi Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+T">Tianhao Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mingzhen Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+T">Tianxiang Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Songtao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Siyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Q">Qian He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06525v1-abstract-short" style="display: inline;"> Video generation technologies are developing rapidly and have broad potential applications. Among these technologies, camera control is crucial for generating professional-quality videos that accurately meet user expectations. However, existing camera control methods still suffer from several limitations, including control precision and the neglect of the control for subject motion dynamics. 
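The last sentences describe the intended usage pattern: frozen location embeddings plus a relatively simple downstream model. A hedged illustration of that pattern follows; the file names and columns are placeholders, not the actual released artifacts:

```python
# Fit a simple ridge regression on top of precomputed location embeddings.
# "county_embeddings.csv" and "county_unemployment.csv" are hypothetical files.
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

emb = pd.read_csv("county_embeddings.csv", index_col="county_id")        # hypothetical
labels = pd.read_csv("county_unemployment.csv", index_col="county_id")   # hypothetical
df = emb.join(labels, how="inner")

X = df.drop(columns=["unemployment_rate"]).values
y = df["unemployment_rate"].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
model = Ridge(alpha=1.0).fit(X_tr, y_tr)
print("held-out R^2:", model.score(X_te, y_te))
```
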
8. arXiv:2411.06525 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: I2VControl-Camera: Precise Video Camera Control with Adjustable Motion Strength
Authors: Wanquan Feng, Jiawei Liu, Pengqi Tu, Tianhao Qi, Mingzhen Sun, Tianxiang Ma, Songtao Zhao, Siyu Zhou, Qian He
Abstract: Video generation technologies are developing rapidly and have broad potential applications. Among these technologies, camera control is crucial for generating professional-quality videos that accurately meet user expectations. However, existing camera control methods still suffer from several limitations, including limited control precision and neglect of control over subject motion dynamics. In this work, we propose I2VControl-Camera, a novel camera control method that significantly enhances controllability while providing adjustability over the strength of subject motion. To improve control precision, we employ point trajectories in the camera coordinate system, instead of only extrinsic matrix information, as our control signal. To accurately control and adjust the strength of subject motion, we explicitly model the higher-order components of the video trajectory expansion, not merely the linear terms, and design an operator that effectively represents the motion strength. We use an adapter architecture that is independent of the base model structure. Experiments on static and dynamic scenes show that our framework outperforms previous methods both quantitatively and qualitatively.
Submitted: 10 November, 2024; originally announced November 2024.

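The idea of separating linear from higher-order trajectory components can be made concrete with a toy operator: fit each tracked point's trajectory with a low-order polynomial in time and measure the energy in the order-two-and-above terms. This is our own illustrative proxy, not the paper's operator:

```python
# Toy motion-strength proxy: polynomial-fit point trajectories over time and
# report the magnitude of the nonlinear (order >= 2) coefficients.
import numpy as np

def motion_strength(traj: np.ndarray, order: int = 3) -> float:
    """traj: (T, N, 2) point tracks in camera coordinates."""
    T = traj.shape[0]
    t = np.linspace(0.0, 1.0, T)
    flat = traj.reshape(T, -1)                   # (T, 2N)
    coeffs = np.polyfit(t, flat, deg=order)      # (order+1, 2N), highest degree first
    higher = coeffs[:-2]                         # drop the linear and constant rows
    return float(np.sqrt((higher ** 2).sum()))
```
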
9. arXiv:2411.05451 [pdf, other]
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: WorkflowLLM: Enhancing Workflow Orchestration Capability of Large Language Models
Authors: Shengda Fan, Xin Cong, Yuepeng Fu, Zhong Zhang, Shuyan Zhang, Yuanwei Liu, Yesai Wu, Yankai Lin, Zhiyuan Liu, Maosong Sun
Abstract: Recent advancements in large language models (LLMs) have driven a revolutionary paradigm shift in process automation from Robotic Process Automation to Agentic Process Automation by automating the workflow orchestration procedure based on LLMs. However, existing LLMs (even the advanced OpenAI GPT-4o) still fall short of satisfactory capability in workflow orchestration. To address this limitation, we present WorkflowLLM, a data-centric framework elaborately designed to enhance the capability of LLMs in workflow orchestration. It first constructs a large-scale fine-tuning dataset, WorkflowBench, with 106,763 samples, covering 1,503 APIs from 83 applications across 28 categories. Specifically, the construction process can be divided into three phases: (1) Data Collection: we collect real-world workflow data from Apple Shortcuts and RoutineHub, transcribing them into Python-style code. We further equip them with generated hierarchical thought via ChatGPT. (2) Query Expansion: we prompt ChatGPT to generate more task queries to enrich the diversity and complexity of workflows. (3) Workflow Generation: we leverage an annotator model trained on collected data to generate workflows for synthesized queries. Finally, we merge the synthetic samples that pass quality confirmation with the collected samples to obtain WorkflowBench. Based on WorkflowBench, we fine-tune Llama-3.1-8B to obtain WorkflowLlama. Our experiments show that WorkflowLlama demonstrates a strong capacity to orchestrate complex workflows, while also achieving notable generalization performance on previously unseen APIs. Additionally, WorkflowBench exhibits robust zero-shot generalization capabilities on an out-of-distribution task planning dataset, T-Eval. Our data and code are available at https://github.com/OpenBMB/WorkflowLLM.
Submitted: 8 November, 2024; originally announced November 2024.

10. arXiv:2411.03628 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video Understanding
Authors: Junming Lin, Zheng Fang, Chi Chen, Zihao Wan, Fuwen Luo, Peng Li, Yang Liu, Maosong Sun
Abstract: The rapid development of Multimodal Large Language Models (MLLMs) has expanded their capabilities from image comprehension to video understanding. However, most of these MLLMs focus primarily on offline video comprehension, necessitating extensive processing of all video frames before any queries can be made. This presents a significant gap compared to the human ability to watch, listen, think, and respond to streaming inputs in real time, highlighting the limitations of current MLLMs. In this paper, we introduce StreamingBench, the first comprehensive benchmark designed to evaluate the streaming video understanding capabilities of MLLMs. StreamingBench assesses three core aspects of streaming video understanding: (1) real-time visual understanding, (2) omni-source understanding, and (3) contextual understanding. The benchmark consists of 18 tasks, featuring 900 videos and 4,500 human-curated QA pairs. Each video features five questions presented at different time points to simulate a continuous streaming scenario. We conduct experiments on StreamingBench with 13 open-source and proprietary MLLMs and find that even the most advanced proprietary MLLMs like Gemini 1.5 Pro and GPT-4o perform significantly below human-level streaming video understanding capabilities. We hope our work can facilitate further advancements for MLLMs, empowering them to approach human-level video comprehension and interaction in more realistic scenarios.
Submitted: 5 November, 2024; originally announced November 2024.

</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03537">arXiv:2411.03537</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03537">pdf</a>, <a href="https://arxiv.org/format/2411.03537">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Two-Stage Pretraining for Molecular Property Prediction in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wijaya%2C+K+T">Kevin Tirta Wijaya</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+M">Minghao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Michael Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Seidel%2C+H">Hans-Peter Seidel</a>, <a href="/search/cs?searchtype=author&amp;query=Matusik%2C+W">Wojciech Matusik</a>, <a href="/search/cs?searchtype=author&amp;query=Babaei%2C+V">Vahid Babaei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03537v1-abstract-short" style="display: inline;"> Accurate property prediction is crucial for accelerating the discovery of new molecules. Although deep learning models have achieved remarkable success, their performance often relies on large amounts of labeled data that are expensive and time-consuming to obtain. Thus, there is a growing need for models that can perform well with limited experimentally-validated data. In this work, we introduce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03537v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03537v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03537v1-abstract-full" style="display: none;"> Accurate property prediction is crucial for accelerating the discovery of new molecules. Although deep learning models have achieved remarkable success, their performance often relies on large amounts of labeled data that are expensive and time-consuming to obtain. Thus, there is a growing need for models that can perform well with limited experimentally-validated data. In this work, we introduce MoleVers, a versatile pretrained model designed for various types of molecular property prediction in the wild, i.e., where experimentally-validated molecular property labels are scarce. MoleVers adopts a two-stage pretraining strategy. In the first stage, the model learns molecular representations from large unlabeled datasets via masked atom prediction and dynamic denoising, a novel task enabled by a new branching encoder architecture. In the second stage, MoleVers is further pretrained using auxiliary labels obtained with inexpensive computational methods, enabling supervised learning without the need for costly experimental data. 
This two-stage framework allows MoleVers to learn representations that generalize effectively across various downstream datasets. We evaluate MoleVers on a new benchmark comprising 22 molecular datasets with diverse types of properties, the majority of which contain 50 or fewer training labels reflecting real-world conditions. MoleVers achieves state-of-the-art results on 20 out of the 22 datasets, and ranks second on the remaining two, highlighting its ability to bridge the gap between data-hungry models and real-world conditions where practically-useful labels are scarce. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02793">arXiv:2411.02793</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02793">pdf</a>, <a href="https://arxiv.org/format/2411.02793">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Toward Robust Incomplete Multimodal Sentiment Analysis via Hierarchical Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingcheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dingkang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shunli Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuaibing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+J">Jinjie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yue Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Q">Qingyao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+X">Xiaolu Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mingyang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Z">Ziyun Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Kou%2C+D">Dongliang Kou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lihua Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2411.02793v1-abstract-full"> Multimodal Sentiment Analysis (MSA) is an important research area that aims to understand and recognize human sentiment through multiple modalities. The complementary information provided by multimodal fusion promotes better sentiment analysis compared to utilizing only a single modality. Nevertheless, in real-world applications, many unavoidable factors may lead to situations of uncertain modality missing, thus hindering the effectiveness of multimodal modeling and degrading the model&#39;s performance. To this end, we propose a Hierarchical Representation Learning Framework (HRLF) for the MSA task under uncertain missing modalities. Specifically, we propose a fine-grained representation factorization module that sufficiently extracts valuable sentiment information by factorizing modality into sentiment-relevant and modality-specific representations through crossmodal translation and sentiment semantic reconstruction. Moreover, a hierarchical mutual information maximization mechanism is introduced to incrementally maximize the mutual information between multi-scale representations to align and reconstruct the high-level semantics in the representations. Ultimately, we propose a hierarchical adversarial learning mechanism that further aligns and adapts the latent distribution of sentiment-relevant representations to produce robust joint multimodal representations. Comprehensive experiments on three datasets demonstrate that HRLF significantly improves MSA performance under uncertain modality missing cases. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02547">arXiv:2411.02547</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02547">pdf</a>, <a href="https://arxiv.org/format/2411.02547">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Modeling Uncertainty in 3D Gaussian Splatting through Continuous Semantic Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilson%2C+J">Joey Wilson</a>, <a href="/search/cs?searchtype=author&amp;query=Almeida%2C+M">Marcelino Almeida</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Min Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Mahajan%2C+S">Sachit Mahajan</a>, <a href="/search/cs?searchtype=author&amp;query=Ghaffari%2C+M">Maani Ghaffari</a>, <a href="/search/cs?searchtype=author&amp;query=Ewen%2C+P">Parker Ewen</a>, <a href="/search/cs?searchtype=author&amp;query=Ghasemalizadeh%2C+O">Omid Ghasemalizadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+C">Cheng-Hao Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Sen%2C+A">Arnie Sen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.02547v1-abstract-full"> In this paper, we present a novel algorithm for probabilistically updating and rasterizing semantic maps within 3D Gaussian Splatting (3D-GS). Although previous methods have introduced algorithms which learn to rasterize features in 3D-GS for enhanced scene understanding, 3D-GS can fail without warning which presents a challenge for safety-critical robotic applications. To address this gap, we propose a method which advances the literature of continuous semantic mapping from voxels to ellipsoids, combining the precise structure of 3D-GS with the ability to quantify uncertainty of probabilistic robotic maps. Given a set of images, our algorithm performs a probabilistic semantic update directly on the 3D ellipsoids to obtain an expectation and variance through the use of conjugate priors.
We also propose a probabilistic rasterization which returns per-pixel segmentation predictions with quantifiable uncertainty. We compare our method with similar probabilistic voxel-based methods to verify our extension to 3D ellipsoids, and perform ablation studies on uncertainty quantification and temporal smoothing. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02335">arXiv:2411.02335</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02335">pdf</a>, <a href="https://arxiv.org/format/2411.02335">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Sparsing Law: Towards Large Language Models with Greater Activation Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuqi Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+C">Chenyang Song</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yingfa Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.02335v1-abstract-full"> Activation sparsity denotes the existence of substantial weakly-contributed elements within activation outputs that can be eliminated, benefiting many important applications concerned with large language models (LLMs).
Although promoting greater activation sparsity within LLMs deserves deep studies, existing works lack comprehensive and quantitative research on the correlation between activation sparsity and potentially influential factors. In this paper, we present a comprehensive study on the quantitative scaling properties and influential factors of the activation sparsity within decoder-only Transformer-based LLMs. Specifically, we propose PPL-$p\%$ sparsity, a precise and performance-aware activation sparsity metric that is applicable to any activation function. Through extensive experiments, we find several important phenomena. Firstly, different activation functions exhibit comparable performance but opposite training-time sparsity trends. The activation ratio (i.e., $1-\mathrm{sparsity\ ratio}$) evolves as a convergent increasing power-law and decreasing logspace power-law with the amount of training data for SiLU-activated and ReLU-activated LLMs, respectively. These demonstrate that ReLU is more efficient as the activation function than SiLU and can leverage more training data to improve activation sparsity. Secondly, the activation ratio linearly increases with the width-depth ratio below a certain bottleneck point, indicating the potential advantage of a deeper architecture at a fixed parameter scale. Finally, at similar width-depth ratios, we surprisingly find that the limit value of activation sparsity varies weakly with the parameter scale, i.e., the activation patterns within LLMs are insensitive to the parameter scale. These empirical laws towards LLMs with greater activation sparsity have important implications for making LLMs more efficient and interpretable. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 13 figures, 6 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01217">arXiv:2411.01217</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01217">pdf</a>, <a href="https://arxiv.org/format/2411.01217">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Preference-CFR$\:$ Beyond Nash Equilibrium for Better Game Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ju%2C+Q">Qi Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Tellier%2C+T">Thomas Tellier</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Meng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Z">Zhemei Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yunfeng Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.01217v1-abstract-full"> Recent advancements in artificial intelligence (AI) have leveraged large-scale games as benchmarks to gauge progress, with AI now frequently outperforming human capabilities. Traditionally, this success has largely relied on solving Nash equilibrium (NE) using variations of the counterfactual regret minimization (CFR) method in games with incomplete information. However, the variety of Nash equilibria has been largely overlooked in previous research, limiting the adaptability of AI to meet diverse human preferences. To address this challenge, where AI is powerful but struggles to meet customization needs, we introduce a novel approach: Preference-CFR, which incorporates two new parameters: preference degree and vulnerability degree. These parameters allow for greater flexibility in AI strategy development without compromising convergence. Our method significantly alters the distribution of final strategies, enabling the creation of customized AI models that better align with individual user needs.
Using Texas Hold&#39;em as a case study, our experiments demonstrate how Preference-CFR can be adjusted to either emphasize customization, prioritizing user preferences, or to enhance performance, striking a balance between the depth of customization and strategic optimality. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00341">arXiv:2411.00341</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00341">pdf</a>, <a href="https://arxiv.org/format/2411.00341">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Bundle Recommendation: Methods, Applications, and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Meng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+X">Xiaohui Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Peipei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J+X">Jimmy Xiangji Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.00341v1-abstract-full"> In recent years, bundle recommendation systems have gained significant attention in both academia and industry due to their ability to enhance user experience and increase sales by recommending a set of items as a bundle rather than individual items. This survey provides a comprehensive review on bundle recommendation, beginning with a taxonomy for exploring product bundling. We classify it into two categories based on bundling strategy from various application domains, i.e., discriminative and generative bundle recommendation.
Then we formulate the corresponding tasks of the two categories and systematically review their methods: 1) representation learning from bundle and item levels and interaction modeling for discriminative bundle recommendation; 2) representation learning from item level and bundle generation for generative bundle recommendation. Subsequently, we survey the resources of bundle recommendation including datasets and evaluation metrics, and conduct reproducibility experiments on mainstream models. Lastly, we discuss the main challenges and highlight the promising future directions in the field of bundle recommendation, aiming to serve as a useful resource for researchers and practitioners. Our code and datasets are publicly available at https://github.com/WUT-IDEA/bundle-recommendation-survey. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22721">arXiv:2410.22721</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22721">pdf</a>, <a href="https://arxiv.org/format/2410.22721">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Community search signatures as foundation features for human-centered geospatial modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mimi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Kamath%2C+C">Chaitanya Kamath</a>, <a href="/search/cs?searchtype=author&amp;query=Agarwal%2C+M">Mohit Agarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Muslim%2C+A">Arbaaz Muslim</a>, <a href="/search/cs?searchtype=author&amp;query=Yee%2C+H">Hector Yee</a>, <a href="/search/cs?searchtype=author&amp;query=Schottlander%2C+D">David Schottlander</a>, <a href="/search/cs?searchtype=author&amp;query=Bavadekar%2C+S">Shailesh Bavadekar</a>, <a href="/search/cs?searchtype=author&amp;query=Efron%2C+N">Niv Efron</a>, <a href="/search/cs?searchtype=author&amp;query=Shetty%2C+S">Shravya Shetty</a>, <a href="/search/cs?searchtype=author&amp;query=Prasad%2C+G">Gautam Prasad</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2410.22721v1-abstract-full"> Aggregated relative search frequencies offer a unique composite signal reflecting people&#39;s habits, concerns, interests, intents, and general information needs, which are not found in other readily available datasets. Temporal search trends have been successfully used in time series modeling across a variety of domains such as infectious diseases, unemployment rates, and retail sales. However, most existing applications require curating specialized datasets of individual keywords, queries, or query clusters, and the search data need to be temporally aligned with the outcome variable of interest. We propose a novel approach for generating an aggregated and anonymized representation of search interest as foundation features at the community level for geospatial modeling. We benchmark these features using spatial datasets across multiple domains. In zip codes with a population greater than 3000 that cover over 95% of the contiguous US population, our models for predicting missing values in a 20% set of holdout counties achieve an average $R^2$ score of 0.74 across 21 health variables, and 0.80 across 6 demographic and environmental variables. Our results demonstrate that these search features can be used for spatial predictions without strict temporal alignment, and that the resulting models outperform spatial interpolation and state-of-the-art methods using satellite imagery features. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, presented at the DMLR workshop at ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18626">arXiv:2410.18626</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18626">pdf</a>, <a href="https://arxiv.org/format/2410.18626">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SAMG: State-Action-Aware Offline-to-Online Reinforcement Learning with Offline Model Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Haochi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+X">Xu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+Q">Quan Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+R">Ruilong Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mingyang Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.18626v1-abstract-full"> The offline-to-online (O2O) paradigm in reinforcement learning (RL) utilizes pre-trained models on offline datasets for subsequent online fine-tuning. However, conventional O2O RL algorithms typically require maintaining and retraining the large offline datasets to mitigate the effects of out-of-distribution (OOD) data, which limits their efficiency in exploiting online samples. To address this challenge, we introduce a new paradigm called SAMG: State-Action-Conditional Offline-to-Online Reinforcement Learning with Offline Model Guidance. In particular, rather than directly training on offline data, SAMG freezes the pre-trained offline critic to provide offline values for each state-action pair to deliver compact offline information. This framework eliminates the need for retraining with offline data by freezing and leveraging these values of the offline model. These are then incorporated with the online target critic using a Bellman equation weighted by a policy state-action-aware coefficient.
This coefficient, derived from a conditional variational auto-encoder (C-VAE), aims to capture the reliability of the offline data on a state-action level. SAMG could be easily integrated with existing Q-function based O2O RL algorithms. Theoretical analysis shows good optimality and lower estimation error of SAMG. Empirical evaluations demonstrate that SAMG outperforms four state-of-the-art O2O RL algorithms in the D4RL benchmark. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17584">arXiv:2410.17584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17584">pdf</a>, <a href="https://arxiv.org/format/2410.17584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Exploring Tokenization Methods for Multitrack Sheet Music Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yashan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Shangda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xingjian Du</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.17584v1-abstract-full"> This study explores the tokenization of multitrack sheet music in ABC notation, introducing two methods--bar-stream and line-stream patching. We compare these methods against existing techniques, including bar patching, byte patching, and Byte Pair Encoding (BPE). In terms of both computational efficiency and the musicality of the generated compositions, experimental results show that bar-stream patching performs best overall compared to the others, which makes it a promising tokenization strategy for sheet music generation.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3 pages, 1 figure, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15633">arXiv:2410.15633</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15633">pdf</a>, <a href="https://arxiv.org/format/2410.15633">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Selecting Influential Samples for Long Context Alignment via Homologous Models&#39; Guidance and Contextual Awareness Measurement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Si%2C+S">Shuzheng Si</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Haozhe Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Gang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yunshui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+K">Kangyang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+C">Chuancheng Lv</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+K">Kaikai An</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+F">Fanchao Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+B">Baobao Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.15633v1-abstract-full"> The expansion of large language models to effectively handle instructions with extremely long contexts has yet to be fully investigated. The primary obstacle lies in constructing a high-quality long instruction-following dataset devised for long context alignment.
Existing studies have attempted to scale up the available data volume by synthesizing long instruction-following samples. However, indiscriminately increasing the quantity of data without a well-defined strategy for ensuring data quality may introduce low-quality samples and restrict the final performance. To bridge this gap, we aim to address the unique challenge of long-context alignment, i.e., modeling the long-range dependencies for handling instructions and lengthy input contexts. We propose GATEAU, a novel framework designed to identify the influential and high-quality samples enriched with long-range dependency relations by utilizing crafted Homologous Models&#39; Guidance (HMG) and Contextual Awareness Measurement (CAM). Specifically, HMG attempts to measure the difficulty of generating corresponding responses due to the long-range dependencies, using the perplexity scores of the response from two homologous models with different context windows. Also, the role of CAM is to measure the difficulty of understanding the long input contexts due to long-range dependencies by evaluating whether the model&#39;s attention is focused on important segments. Built upon both proposed methods, we select the most challenging samples as the influential data to effectively frame the long-range dependencies, thereby achieving better performance of LLMs. Comprehensive experiments indicate that GATEAU effectively identifies samples enriched with long-range dependency relations and the model trained on these selected samples exhibits better instruction-following and long-context understanding capabilities. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13509">arXiv:2410.13509</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13509">pdf</a>, <a href="https://arxiv.org/format/2410.13509">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RAG-DDR: Optimizing Retrieval-Augmented Generation Using Differentiable Data Rewards </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinze Li</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+S">Sen Mei</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yukun Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zheni Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+G">Ge Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+C">Chenyan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.13509v1-abstract-full"> Retrieval-Augmented Generation (RAG) has proven its effectiveness in mitigating hallucinations in Large Language Models (LLMs) by retrieving knowledge from external resources. To adapt LLMs for RAG pipelines, current approaches use instruction tuning to optimize LLMs, improving their ability to utilize retrieved knowledge. This supervised fine-tuning (SFT) approach focuses on equipping LLMs to handle diverse RAG tasks using different instructions. However, it trains RAG modules to overfit training signals and overlooks the varying data preferences among agents within the RAG system. In this paper, we propose a Differentiable Data Rewards (DDR) method, which end-to-end trains RAG systems by aligning data preferences between different RAG modules. DDR works by collecting the rewards to optimize each agent with a rollout method.
This method prompts agents to sample some potential responses as perturbations, evaluates the impact of these perturbations on the whole RAG system, and subsequently optimizes the agent to produce outputs that improve the performance of the RAG system. Our experiments on various knowledge-intensive tasks demonstrate that DDR significantly outperforms the SFT method, particularly for LLMs with smaller-scale parameters that depend more on the retrieved knowledge. Additionally, DDR exhibits a stronger capability to align the data preference between RAG modules. The DDR method makes the generation module more effective in extracting key information from documents and mitigating conflicts between parametric memory and external knowledge. All codes are available at https://github.com/OpenMatch/RAG-DDR. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13267">arXiv:2410.13267</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13267">pdf</a>, <a href="https://arxiv.org/format/2410.13267">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CLaMP 2: Multimodal Music Information Retrieval Across 101 Languages Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Shangda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yashan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhancheng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+M">Monan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+X">Xuefeng Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yuejie Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuanliang Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiafeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaobing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2410.13267v1-abstract-full"> Current music information retrieval systems face challenges in managing linguistic diversity and integrating various musical modalities. These limitations reduce their effectiveness in a global, multimodal music environment. To address these issues, we introduce CLaMP 2, a system compatible with 101 languages that supports both ABC notation (a text-based musical notation format) and MIDI (Musical Instrument Digital Interface) for music information retrieval. CLaMP 2, pre-trained on 1.5 million ABC-MIDI-text triplets, includes a multilingual text encoder and a multimodal music encoder aligned via contrastive learning. By leveraging large language models, we obtain refined and consistent multilingual descriptions at scale, significantly reducing textual noise and balancing language distribution. Our experiments show that CLaMP 2 achieves state-of-the-art results in both multilingual semantic search and music classification across modalities, thus establishing a new standard for inclusive and global music information retrieval. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 10 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12995">arXiv:2410.12995</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12995">pdf</a>, <a href="https://arxiv.org/format/2410.12995">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Configurable Embodied Data Generation for Class-Agnostic RGB-D Video Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Opipari%2C+A">Anthony Opipari</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnan%2C+A+K">Aravindhan K Krishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Gayaka%2C+S">Shreekant Gayaka</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Min Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+C">Cheng-Hao Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Sen%2C+A">Arnie Sen</a>, <a href="/search/cs?searchtype=author&amp;query=Jenkins%2C+O+C">Odest Chadwicke Jenkins</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.12995v1-abstract-full"> This paper presents a method for generating large-scale datasets to improve class-agnostic video segmentation across robots with different form factors. Specifically, we consider the question of whether video segmentation models trained on generic segmentation data could be more effective for particular robot platforms if robot embodiment is factored into the data generation process. To answer this question, a pipeline is formulated for using 3D reconstructions (e.g. from HM3DSem) to generate segmented videos that are configurable based on a robot&#39;s embodiment (e.g. sensor type, sensor placement, and illumination source). A resulting massive RGB-D video panoptic segmentation dataset (MVPd) is introduced for extensive benchmarking with foundation and video segmentation models, as well as to support embodiment-focused research in video segmentation.
Our experimental findings demonstrate that using MVPd for finetuning can lead to performance improvements when transferring foundation models to certain robot embodiments, such as specific camera placements. These experiments also show that using 3D modalities (depth images and camera pose) can lead to improvements in video segmentation accuracy and consistency. The project webpage is available at https://topipari.com/projects/MVPd </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in IEEE Robotics and Automation Letters October 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12361">arXiv:2410.12361</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12361">pdf</a>, <a href="https://arxiv.org/format/2410.12361">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Proactive Agent: Shifting LLM Agents from Reactive Responses to Active Assistance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaxi Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shenzhi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+C">Cheng Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guirong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Q">Qinyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yesai Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Huadong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cong%2C+X">Xin Cong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Weiwen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yasheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Fangming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2410.12361v1-abstract-full"> Agents powered by large language models have shown remarkable abilities in solving complex tasks. However, most agent systems remain reactive, limiting their effectiveness in scenarios requiring foresight and autonomous decision-making. In this paper, we tackle the challenge of developing proactive agents capable of anticipating and initiating tasks without explicit human instructions. We propose a novel data-driven approach for this problem. Firstly, we collect real-world human activities to generate proactive task predictions. These predictions are then labeled by human annotators as either accepted or rejected. The labeled data is used to train a reward model that simulates human judgment and serves as an automatic evaluator of the proactiveness of LLM agents. Building on this, we develop a comprehensive data generation pipeline to create a diverse dataset, ProactiveBench, containing 6,790 events. Finally, we demonstrate that fine-tuning models with the proposed ProactiveBench can significantly elicit the proactiveness of LLM agents. Experimental results show that our fine-tuned model achieves an F1-Score of 66.47% in proactively offering assistance, outperforming all open-source and closed-source models. These results highlight the potential of our method in creating more proactive and effective agent systems, paving the way for future advancements in human-agent collaboration. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11551">arXiv:2410.11551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11551">pdf</a>, <a href="https://arxiv.org/format/2410.11551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LoKO: Low-Rank Kalman Optimizer for Online Fine-Tuning of Large Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abdi%2C+H">Hossein Abdi</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mingfei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A">Andi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kaski%2C+S">Samuel Kaski</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+W">Wei Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11551v1-abstract-short" style="display: inline;"> Training large models with millions or even billions of parameters from scratch incurs substantial computational costs. Parameter Efficient Fine-Tuning (PEFT) methods, particularly Low-Rank Adaptation (LoRA), address this challenge by adapting only a reduced number of parameters to specific tasks with gradient-based optimizers. In this paper, we cast PEFT as an optimal filtering/state estimation p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11551v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11551v1-abstract-full" style="display: none;"> Training large models with millions or even billions of parameters from scratch incurs substantial computational costs. Parameter Efficient Fine-Tuning (PEFT) methods, particularly Low-Rank Adaptation (LoRA), address this challenge by adapting only a reduced number of parameters to specific tasks with gradient-based optimizers. In this paper, we cast PEFT as an optimal filtering/state estimation problem and present Low-Rank Kalman Optimizer (LoKO) to estimate the optimal trainable parameters in an online manner. We leverage the low-rank decomposition in LoRA to significantly reduce matrix sizes in Kalman iterations and further capitalize on a diagonal approximation of the covariance matrix to effectively decrease computational complexity from quadratic to linear in the number of trainable parameters. Moreover, we discovered that the initialization of the covariance matrix within the Kalman algorithm and the accurate estimation of the observation noise covariance are the keys in this formulation, and we propose robust approaches that work well across a vast range of well-established computer vision and language models. 
Our results show that LoKO converges with fewer iterations and yields better performance models compared to commonly used optimizers with LoRA in both image classifications and language tasks. Our study opens up the possibility of leveraging the Kalman filter as an effective optimizer for the online fine-tuning of large models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11551v1-abstract-full').style.display = 'none'; document.getElementById('2410.11551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11105">arXiv:2410.11105</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11105">pdf</a>, <a href="https://arxiv.org/format/2410.11105">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Solar and Stellar Astrophysics">astro-ph.SR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Emulators for stellar profiles in binary population modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Teng%2C+E">Elizabeth Teng</a>, <a href="/search/cs?searchtype=author&amp;query=Demir%2C+U">Ugur Demir</a>, <a href="/search/cs?searchtype=author&amp;query=Doctor%2C+Z">Zoheyr Doctor</a>, <a href="/search/cs?searchtype=author&amp;query=Srivastava%2C+P+M">Philipp M. Srivastava</a>, <a href="/search/cs?searchtype=author&amp;query=Lalvani%2C+S">Shamal Lalvani</a>, <a href="/search/cs?searchtype=author&amp;query=Kalogera%2C+V">Vicky Kalogera</a>, <a href="/search/cs?searchtype=author&amp;query=Katsaggelos%2C+A">Aggelos Katsaggelos</a>, <a href="/search/cs?searchtype=author&amp;query=Andrews%2C+J+J">Jeff J. Andrews</a>, <a href="/search/cs?searchtype=author&amp;query=Bavera%2C+S+S">Simone S. Bavera</a>, <a href="/search/cs?searchtype=author&amp;query=Briel%2C+M+M">Max M. Briel</a>, <a href="/search/cs?searchtype=author&amp;query=Gossage%2C+S">Seth Gossage</a>, <a href="/search/cs?searchtype=author&amp;query=Kovlakas%2C+K">Konstantinos Kovlakas</a>, <a href="/search/cs?searchtype=author&amp;query=Kruckow%2C+M+U">Matthias U. Kruckow</a>, <a href="/search/cs?searchtype=author&amp;query=Rocha%2C+K+A">Kyle Akira Rocha</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Meng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Z">Zepei Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Zapartas%2C+E">Emmanouil Zapartas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11105v1-abstract-short" style="display: inline;"> Knowledge about the internal physical structure of stars is crucial to understanding their evolution. 
The novel binary population synthesis code POSYDON includes a module for interpolating the stellar and binary properties of any system at the end of binary MESA evolution based on a pre-computed set of models. In this work, we present a new emulation method for predicting stellar profiles, i.e., t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11105v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11105v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11105v1-abstract-full" style="display: none;"> Knowledge about the internal physical structure of stars is crucial to understanding their evolution. The novel binary population synthesis code POSYDON includes a module for interpolating the stellar and binary properties of any system at the end of binary MESA evolution based on a pre-computed set of models. In this work, we present a new emulation method for predicting stellar profiles, i.e., the internal stellar structure along the radial axis, using machine learning techniques. We use principal component analysis for dimensionality reduction and fully-connected feed-forward neural networks for making predictions. We find accuracy to be comparable to that of nearest neighbor approximation, with a strong advantage in terms of memory and storage efficiency. By delivering more information about the evolution of stellar internal structure, these emulators will enable faster simulations of higher physical fidelity with large-scale simulations of binary star population synthesis possible with POSYDON and other population synthesis codes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11105v1-abstract-full').style.display = 'none'; document.getElementById('2410.11105v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 10 figures. 
Submitted to Astronomy and Computing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11019">arXiv:2410.11019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11019">pdf</a>, <a href="https://arxiv.org/format/2410.11019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ET-Former: Efficient Triplane Deformable Attention for 3D Semantic Scene Completion From Monocular Camera </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Jing Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+H">He Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+X">Xuewei Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J+J">Jong Jin Park</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Min Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Madhivanan%2C+R">Rajasimman Madhivanan</a>, <a href="/search/cs?searchtype=author&amp;query=Manocha%2C+D">Dinesh Manocha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.11019v1-abstract-full"> We introduce ET-Former, a novel end-to-end algorithm for semantic scene completion using a single monocular camera. Our approach generates a semantic occupancy map from a single RGB observation while simultaneously providing uncertainty estimates for semantic predictions. By designing a triplane-based deformable attention mechanism, our approach improves geometric understanding of the scene compared with other SOTA approaches and reduces noise in semantic predictions. Additionally, through the use of a Conditional Variational AutoEncoder (CVAE), we estimate the uncertainties of these predictions. The generated semantic and uncertainty maps will aid in the formulation of navigation strategies that facilitate safe and permissible decision-making in the future. Evaluated on the Semantic-KITTI dataset, ET-Former achieves the highest IoU and mIoU, surpassing other methods by 15.16% in IoU and 24.24% in mIoU, while reducing GPU memory usage of existing methods by 25%-50.5%.
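<p class="is-size-7">For reference, the IoU and mIoU numbers quoted above follow the standard per-class intersection-over-union definition. The short sketch below shows that metric on toy label maps; it assumes NumPy and is not the authors' evaluation code.</p>
<pre><code>
# Standard per-class IoU and mean IoU on toy label maps (illustrative, not ET-Former code).
import numpy as np

def iou_per_class(pred, target, num_classes):
    # pred, target: integer class maps with the same shape.
    ious = []
    for c in range(num_classes):
        inter = np.logical_and(pred == c, target == c).sum()
        union = np.logical_or(pred == c, target == c).sum()
        ious.append(inter / union if union else float("nan"))
    return ious

pred = np.array([[0, 1], [2, 2]])
target = np.array([[0, 1], [2, 1]])
ious = iou_per_class(pred, target, num_classes=3)
print(ious, "mIoU:", np.nanmean(ious))
</code></pre>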
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11019v1-abstract-full').style.display = 'none'; document.getElementById('2410.11019v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10594">arXiv:2410.10594</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10594">pdf</a>, <a href="https://arxiv.org/format/2410.10594">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chaoyue Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Bokai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+J">Junbo Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Ran%2C+J">Junhao Ran</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yukun Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10594v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) is an effective technique that enables large language models (LLMs) to utilize external knowledge sources for generation. However, current RAG systems are solely based on text, rendering it impossible to utilize vision information like layout and images that play crucial roles in real-world multi-modality documents. In this paper, we introduce VisRAG, which tac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10594v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10594v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10594v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) is an effective technique that enables large language models (LLMs) to utilize external knowledge sources for generation. 
However, current RAG systems are solely based on text, rendering it impossible to utilize vision information like layout and images that play crucial roles in real-world multi-modality documents. In this paper, we introduce VisRAG, which tackles this issue by establishing a vision-language model (VLM)-based RAG pipeline. In this pipeline, instead of first parsing the document to obtain text, the document is directly embedded using a VLM as an image and then retrieved to enhance the generation of a VLM. Compared to traditional text-based RAG, VisRAG maximizes the retention and utilization of the data information in the original documents, eliminating the information loss introduced during the parsing process. We collect both open-source and synthetic data to train the retriever in VisRAG and explore a variety of generation methods. Experiments demonstrate that VisRAG outperforms traditional RAG in both the retrieval and generation stages, achieving a 25--39\% end-to-end performance gain over traditional text-based RAG pipeline. Further analysis reveals that VisRAG is effective in utilizing training data and demonstrates strong generalization capability, positioning it as a promising solution for RAG on multi-modality documents. Our code and data are available at https://github.com/openbmb/visrag . </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10538">arXiv:2410.10538</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10538">pdf</a>, <a href="https://arxiv.org/format/2410.10538">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Data-Driven Approaches for Modelling Target Behaviour </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Schlangen%2C+I">Isabel Schlangen</a>, <a href="/search/cs?searchtype=author&amp;query=Brandenburger%2C+A">André Brandenburger</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengwei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Hopgood%2C+J+R">James R. Hopgood</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10538v1-abstract-short" style="display: inline;"> The performance of tracking algorithms strongly depends on the chosen model assumptions regarding the target dynamics. If there is a strong mismatch between the chosen model and the true object motion, the track quality may be poor or the track is easily lost. Still, the true dynamics might not be known a priori or it is too complex to be expressed in a tractable mathematical formulation.
This pap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10538v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10538v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10538v1-abstract-full" style="display: none;"> The performance of tracking algorithms strongly depends on the chosen model assumptions regarding the target dynamics. If there is a strong mismatch between the chosen model and the true object motion, the track quality may be poor or the track is easily lost. Still, the true dynamics might not be known a priori or it is too complex to be expressed in a tractable mathematical formulation. This paper provides a comparative study between three different methods that use machine learning to describe the underlying object motion based on training data. The first method builds on Gaussian Processes (GPs) for predicting the object motion, the second learns the parameters of an Interacting Multiple Model (IMM) filter and the third uses a Long Short-Term Memory (LSTM) network as a motion model. All methods are compared against an Extended Kalman Filter (EKF) with an analytic motion model as a benchmark and their respective strengths are highlighted in one simulated and two real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10538v1-abstract-full').style.display = 'none'; document.getElementById('2410.10538v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 9 figures. 
Submitted to IEEE Transactions on Signal Processing on October 14, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09467">arXiv:2410.09467</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09467">pdf</a>, <a href="https://arxiv.org/format/2410.09467">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Single Image to 3D Generation using Gaussian Splatting and Hybrid Diffusion Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Basak%2C+H">Hritam Basak</a>, <a href="/search/cs?searchtype=author&amp;query=Tabatabaee%2C+H">Hadi Tabatabaee</a>, <a href="/search/cs?searchtype=author&amp;query=Gayaka%2C+S">Shreekant Gayaka</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Ming-Feng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+C">Cheng-Hao Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Sen%2C+A">Arnie Sen</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Min Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Z">Zhaozheng Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09467v2-abstract-short" style="display: inline;"> 3D object generation from a single image involves estimating the full 3D geometry and texture of unseen views from an unposed RGB image captured in the wild. Accurately reconstructing an object&#39;s complete 3D structure and texture has numerous applications in real-world scenarios, including robotic manipulation, grasping, 3D scene understanding, and AR/VR. Recent advancements in 3D object generatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09467v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09467v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09467v2-abstract-full" style="display: none;"> 3D object generation from a single image involves estimating the full 3D geometry and texture of unseen views from an unposed RGB image captured in the wild. Accurately reconstructing an object&#39;s complete 3D structure and texture has numerous applications in real-world scenarios, including robotic manipulation, grasping, 3D scene understanding, and AR/VR. Recent advancements in 3D object generation have introduced techniques that reconstruct an object&#39;s 3D shape and texture by optimizing the efficient representation of Gaussian Splatting, guided by pre-trained 2D or 3D diffusion models. However, a notable disparity exists between the training datasets of these models, leading to distinct differences in their outputs. While 2D models generate highly detailed visuals, they lack cross-view consistency in geometry and texture. In contrast, 3D models ensure consistency across different views but often result in overly smooth textures. 
We propose bridging the gap between 2D and 3D diffusion models to address this limitation by integrating a two-stage frequency-based distillation loss with Gaussian Splatting. Specifically, we leverage geometric priors in the low-frequency spectrum from a 3D diffusion model to maintain consistent geometry and use a 2D diffusion model to refine the fidelity and texture in the high-frequency spectrum of the generated 3D structure, resulting in more detailed and fine-grained outcomes. Our approach enhances geometric consistency and visual quality, outperforming the current SOTA. Additionally, we demonstrate the easy adaptability of our method for efficient object pose estimation and tracking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09467v2-abstract-full').style.display = 'none'; document.getElementById('2410.09467v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09342">arXiv:2410.09342</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09342">pdf</a>, <a href="https://arxiv.org/format/2410.09342">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM$\times$MapReduce: Simplified Long-Sequence Processing using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zihan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chao%2C+Y">Yu Chao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhili Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+R">Rongqiao An</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Q">Qi Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Z">Zhixing Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xiaodong Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09342v1-abstract-short" style="display: inline;"> Enlarging the context window of large language models (LLMs) has become a crucial research area, particularly for applications involving extremely long texts. In this work, we propose a novel training-free framework for processing long texts, utilizing a divide-and-conquer strategy to achieve comprehensive document understanding. 
The proposed LLM$\times$MapReduce framework splits the entire docume&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09342v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09342v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09342v1-abstract-full" style="display: none;"> Enlarging the context window of large language models (LLMs) has become a crucial research area, particularly for applications involving extremely long texts. In this work, we propose a novel training-free framework for processing long texts, utilizing a divide-and-conquer strategy to achieve comprehensive document understanding. The proposed LLM$\times$MapReduce framework splits the entire document into several chunks for LLMs to read and then aggregates the intermediate answers to produce the final output. The main challenge for divide-and-conquer long text processing frameworks lies in the risk of losing essential long-range information when splitting the document, which can lead the model to produce incomplete or incorrect answers based on the segmented texts. Disrupted long-range information can be classified into two categories: inter-chunk dependency and inter-chunk conflict. We design a structured information protocol to better cope with inter-chunk dependency and an in-context confidence calibration mechanism to resolve inter-chunk conflicts. Experimental results demonstrate that LLM$\times$MapReduce can outperform representative open-source and commercial long-context LLMs, and is applicable to several different models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09342v1-abstract-full').style.display = 'none'; document.getElementById('2410.09342v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress. 
Code: https://github.com/thunlp/LLMxMapReduce</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08983">arXiv:2410.08983</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08983">pdf</a>, <a href="https://arxiv.org/format/2410.08983">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DEL: Discrete Element Learner for Learning 3D Particle Dynamics with Neural Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiaxu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jingkai Sun</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Junhao He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mingyuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Renjing Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08983v1-abstract-short" style="display: inline;"> Learning-based simulators show great potential for simulating particle dynamics when 3D groundtruth is available, but per-particle correspondences are not always accessible. The development of neural rendering presents a new solution to this field to learn 3D dynamics from 2D images by inverse rendering. However, existing approaches still suffer from ill-posed natures resulting from the 2D to 3D u&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08983v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08983v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08983v1-abstract-full" style="display: none;"> Learning-based simulators show great potential for simulating particle dynamics when 3D groundtruth is available, but per-particle correspondences are not always accessible. The development of neural rendering presents a new solution to this field to learn 3D dynamics from 2D images by inverse rendering. However, existing approaches still suffer from ill-posed natures resulting from the 2D to 3D uncertainty, for example, specific 2D images can correspond with various 3D particle distributions. To mitigate such uncertainty, we consider a conventional, mechanically interpretable framework as the physical priors and extend it to a learning-based version. In brief, we incorporate the learnable graph kernels into the classic Discrete Element Analysis (DEA) framework to implement a novel mechanics-integrated learning system. In this case, the graph network kernels are only used for approximating some specific mechanical operators in the DEA framework rather than the whole dynamics mapping. 
By integrating the strong physics priors, our methods can effectively learn the dynamics of various materials from the partial 2D observations in a unified manner. Experiments show that our approach outperforms other learned simulators by a large margin in this context and is robust to different renderers, fewer training samples, and fewer camera views. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08983v1-abstract-full').style.display = 'none'; document.getElementById('2410.08983v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08821">arXiv:2410.08821</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08821">pdf</a>, <a href="https://arxiv.org/format/2410.08821">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Retriever-and-Memory: Towards Adaptive Note-Enhanced Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruobing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+D">Daren Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qingfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yixuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yukun Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08821v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) mitigates issues of the factual errors and hallucinated outputs generated by Large Language Models (LLMs) in open-domain question-answering tasks (OpenQA) via introducing external knowledge. 
For complex QA, however, existing RAG methods use LLMs to actively predict retrieval timing and directly use the retrieved information for generation, regardless of whether&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08821v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08821v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08821v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) mitigates issues of the factual errors and hallucinated outputs generated by Large Language Models (LLMs) in open-domain question-answering tasks (OpenQA) via introducing external knowledge. For complex QA, however, existing RAG methods use LLMs to actively predict retrieval timing and directly use the retrieved information for generation, regardless of whether the retrieval timing accurately reflects the actual information needs, or sufficiently considers prior retrieved knowledge, which may result in insufficient information gathering and interaction, yielding low-quality answers. To address these, we propose a generic RAG approach called Adaptive Note-Enhanced RAG (Adaptive-Note) for complex QA tasks, which includes the iterative information collector, adaptive memory reviewer, and task-oriented generator, while following a new Retriever-and-Memory paradigm. Specifically, Adaptive-Note introduces an overarching view of knowledge growth, iteratively gathering new information in the form of notes and updating them into the existing optimal knowledge structure, enhancing high-quality knowledge interactions. In addition, we employ an adaptive, note-based stop-exploration strategy to decide &#34;what to retrieve and when to stop&#34; to encourage sufficient knowledge exploration. We conduct extensive experiments on five complex QA datasets, and the results demonstrate the superiority and effectiveness of our method and its components. The code and data are at https://github.com/thunlp/Adaptive-Note. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08821v1-abstract-full').style.display = 'none'; document.getElementById('2410.08821v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
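<p class="is-size-7">The note-taking loop described above (iteratively retrieve, merge new information into a running note, and stop exploring once nothing new is gathered) can be outlined roughly as follows. This is a schematic sketch in that spirit only: <code>retrieve</code> and <code>summarize</code> are toy placeholders, not the Adaptive-Note components.</p>
<pre><code>
# Schematic "retrieve, update note, decide to stop" loop; all components are placeholders.

def retrieve(query, step):
    corpus = {0: "Passage about topic A.", 1: "Passage about topic B.", 2: "Passage about topic A."}
    return corpus.get(step, "")

def summarize(note, passage):
    # A real system would ask an LLM to merge the passage into the running note.
    return (note + " " + passage).strip()

def gather_notes(query, max_steps=5):
    note = ""
    for step in range(max_steps):
        passage = retrieve(query, step)
        if not passage or passage in note:
            break  # exploration adds nothing new: stop and answer from the note
        note = summarize(note, passage)
    return note

print(gather_notes("What is topic A?"))
</code></pre>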
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08530">arXiv:2410.08530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08530">pdf</a>, <a href="https://arxiv.org/format/2410.08530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Ego3DT: Tracking Every 3D Object in Ego-centric Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shengyu Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhonghan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Meiqi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wendi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jieyang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yixian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Gaoang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08530v1-abstract-short" style="display: inline;"> The growing interest in embodied intelligence has brought ego-centric perspectives to contemporary research. One significant challenge within this realm is the accurate localization and tracking of objects in ego-centric videos, primarily due to the substantial variability in viewing angles. Addressing this issue, this paper introduces a novel zero-shot approach for the 3D reconstruction and track&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08530v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08530v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08530v1-abstract-full" style="display: none;"> The growing interest in embodied intelligence has brought ego-centric perspectives to contemporary research. One significant challenge within this realm is the accurate localization and tracking of objects in ego-centric videos, primarily due to the substantial variability in viewing angles. Addressing this issue, this paper introduces a novel zero-shot approach for the 3D reconstruction and tracking of all objects from the ego-centric video. We present Ego3DT, a novel framework that initially identifies and extracts detection and segmentation information of objects within the ego environment. Utilizing information from adjacent video frames, Ego3DT dynamically constructs a 3D scene of the ego view using a pre-trained 3D scene reconstruction model. 
Additionally, we have innovated a dynamic hierarchical association mechanism for creating stable 3D tracking trajectories of objects in ego-centric videos. Moreover, the efficacy of our approach is corroborated by extensive experiments on two newly compiled datasets, with 1.04x - 2.90x in HOTA, showcasing the robustness and accuracy of our method in diverse ego-centric scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08530v1-abstract-full').style.display = 'none'; document.getElementById('2410.08530v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM Multimedia 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08115">arXiv:2410.08115</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08115">pdf</a>, <a href="https://arxiv.org/format/2410.08115">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Optima: Optimizing Effectiveness and Efficiency for LLM-Based Multi-Agent System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Jiarui Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+C">Chen Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08115v1-abstract-short" style="display: inline;"> Large Language Model (LLM) based multi-agent systems (MAS) show remarkable potential in collaborative problem-solving, yet they still face critical challenges: low communication efficiency, poor scalability, and a lack of effective parameter-updating optimization methods. We present Optima, a novel framework that addresses these issues by significantly enhancing both communication efficiency and t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08115v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08115v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08115v1-abstract-full" style="display: none;"> Large Language Model (LLM) based multi-agent systems (MAS) show remarkable potential in collaborative problem-solving, yet they still face critical challenges: low communication efficiency, poor scalability, and a lack of effective parameter-updating optimization methods. 
We present Optima, a novel framework that addresses these issues by significantly enhancing both communication efficiency and task effectiveness in LLM-based MAS through LLM training. Optima employs an iterative generate, rank, select, and train paradigm with a reward function balancing task performance, token efficiency, and communication readability. We explore various RL algorithms, including Supervised Fine-Tuning, Direct Preference Optimization, and their hybrid approaches, providing insights into their effectiveness-efficiency trade-offs. We integrate Monte Carlo Tree Search-inspired techniques for DPO data generation, treating conversation turns as tree nodes to explore diverse interaction paths. Evaluated on common multi-agent tasks, including information-asymmetric question answering and complex reasoning, Optima shows consistent and substantial improvements over single-agent baselines and vanilla MAS based on Llama 3 8B, achieving up to 2.8x performance gain with less than 10\% tokens on tasks requiring heavy information exchange. Moreover, Optima&#39;s efficiency gains open new possibilities for leveraging inference-compute more effectively, leading to improved inference-time scaling laws. By addressing fundamental challenges in LLM-based MAS, Optima shows the potential towards scalable, efficient, and effective MAS (https://chenweize1998.github.io/optima-project-page). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08115v1-abstract-full').style.display = 'none'; document.getElementById('2410.08115v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
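<p class="is-size-7">The iterative generate, rank, select, and train paradigm described above can be pictured with a toy reward that trades task performance against token cost. The candidate dialogues, scores, and weight <code>lam</code> below are illustrative assumptions, not values or code from the paper.</p>
<pre><code>
# Toy generate/rank/select step: rank candidate multi-agent dialogues by a reward that
# balances task score and token usage, then keep the best for further training.

def reward(task_score, num_tokens, lam=0.001):
    # Higher task score is better; longer exchanges are penalized.
    return task_score - lam * num_tokens

candidates = [
    {"dialogue": "A asks one question, B answers directly.", "task_score": 0.90, "num_tokens": 120},
    {"dialogue": "A and B exchange many redundant turns.", "task_score": 0.92, "num_tokens": 600},
    {"dialogue": "A guesses without communicating.", "task_score": 0.40, "num_tokens": 30},
]

ranked = sorted(candidates, key=lambda c: reward(c["task_score"], c["num_tokens"]), reverse=True)
selected = ranked[:1]  # top-ranked interactions become training data for the next iteration
print(selected[0]["dialogue"], round(reward(selected[0]["task_score"], selected[0]["num_tokens"]), 3))
</code></pre>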
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07526">arXiv:2410.07526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07526">pdf</a>, <a href="https://arxiv.org/format/2410.07526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MKGL: Mastery of a Three-Word Language </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Lingbing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Bo%2C+Z">Zhongpu Bo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiaoyan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+Y">Yarong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yangyifei Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huajun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07526v1-abstract-short" style="display: inline;"> Large language models (LLMs) have significantly advanced performance across a spectrum of natural language processing (NLP) tasks. Yet, their application to knowledge graphs (KGs), which describe facts in the form of triplets and allow minimal hallucinations, remains an underexplored frontier. In this paper, we investigate the integration of LLMs with KGs by introducing a specialized KG Language (&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07526v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07526v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07526v1-abstract-full" style="display: none;"> Large language models (LLMs) have significantly advanced performance across a spectrum of natural language processing (NLP) tasks. Yet, their application to knowledge graphs (KGs), which describe facts in the form of triplets and allow minimal hallucinations, remains an underexplored frontier. In this paper, we investigate the integration of LLMs with KGs by introducing a specialized KG Language (KGL), where a sentence precisely consists of an entity noun, a relation verb, and ends with another entity noun. 
Despite KGL&#39;s unfamiliar vocabulary to the LLM, we facilitate its learning through a tailored dictionary and illustrative sentences, and enhance context understanding via real-time KG context retrieval and KGL token embedding augmentation. Our results reveal that LLMs can achieve fluency in KGL, drastically reducing errors compared to conventional KG embedding methods on KG completion. Furthermore, our enhanced LLM shows exceptional competence in generating accurate three-word sentences from an initial entity and interpreting new unseen terms out of KGs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07526v1-abstract-full').style.display = 'none'; document.getElementById('2410.07526v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 (spotlight)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07145">arXiv:2410.07145</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07145">pdf</a>, <a href="https://arxiv.org/format/2410.07145">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stuffed Mamba: State Collapse and State Capacity of RNN-Based Long-Context Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yingfa Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinrong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengding Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07145v1-abstract-short" style="display: inline;"> One essential advantage of recurrent neural networks (RNNs) over transformer-based language models is their linear computational complexity concerning the sequence length, which makes them much faster in handling long sequences during inference. 
However, most publicly available RNNs (e.g., Mamba and RWKV) are trained on sequences with less than 10K tokens, and their effectiveness in longer context&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07145v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07145v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07145v1-abstract-full" style="display: none;"> One essential advantage of recurrent neural networks (RNNs) over transformer-based language models is their linear computational complexity concerning the sequence length, which makes them much faster in handling long sequences during inference. However, most publicly available RNNs (e.g., Mamba and RWKV) are trained on sequences with less than 10K tokens, and their effectiveness in longer contexts remains largely unsatisfying so far. In this paper, we study the cause of the inability to process long context for RNNs and suggest critical mitigations. We examine two practical concerns when applying state-of-the-art RNNs to long contexts: (1) the inability to extrapolate to inputs longer than the training length and (2) the upper bound of memory capacity. Addressing the first concern, we first investigate *state collapse* (SC), a phenomenon that causes severe performance degradation on sequence lengths not encountered during training. With controlled experiments, we attribute this to overfitting due to the recurrent state being overparameterized for the training length. For the second concern, we train a series of Mamba-2 models on long documents to empirically estimate the recurrent state capacity in language modeling and passkey retrieval. Then, three SC mitigation methods are proposed to improve Mamba-2&#39;s length generalizability, allowing the model to process more than 1M tokens without SC. We also find that the recurrent state capacity in passkey retrieval scales exponentially to the state size, and we empirically train a Mamba-2 370M with near-perfect passkey retrieval accuracy on 256K context length. This suggests a promising future for RNN-based long-context modeling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07145v1-abstract-full').style.display = 'none'; document.getElementById('2410.07145v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
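<p class="is-size-7">The passkey-retrieval probe mentioned above is usually built by hiding a random key inside long filler text and asking the model to recall it. The sketch below constructs such a probe; the filler text and prompt wording are assumptions for illustration, not the paper's exact setup.</p>
<pre><code>
# Build a synthetic passkey-retrieval prompt of roughly the requested length (in words).
import random

def build_passkey_prompt(context_words=1000):
    filler = "The grass is green. The sky is blue. "
    passkey = str(random.randint(10000, 99999))
    needle = "The pass key is " + passkey + ". Remember it. "
    repeats = max(1, context_words // (2 * len(filler.split())))
    prompt = filler * repeats + needle + filler * repeats + "What is the pass key?"
    return prompt, passkey

prompt, passkey = build_passkey_prompt()
print(len(prompt.split()), "words; expected answer:", passkey)
</code></pre>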
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 18 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06581">arXiv:2410.06581</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06581">pdf</a>, <a href="https://arxiv.org/format/2410.06581">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Legal Case Retrieval via Scaling High-quality Synthetic Query-Candidate Pairs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Cheng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huimin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06581v1-abstract-short" style="display: inline;"> Legal case retrieval (LCR) aims to provide similar cases as references for a given fact description. This task is crucial for promoting consistent judgments in similar cases, effectively enhancing judicial fairness and improving work efficiency for judges. However, existing works face two main challenges for real-world applications: existing works mainly focus on case-to-case retrieval using lengt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06581v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06581v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06581v1-abstract-full" style="display: none;"> Legal case retrieval (LCR) aims to provide similar cases as references for a given fact description. This task is crucial for promoting consistent judgments in similar cases, effectively enhancing judicial fairness and improving work efficiency for judges. However, existing works face two main challenges for real-world applications: existing works mainly focus on case-to-case retrieval using lengthy queries, which does not match real-world scenarios; and the limited data scale, with current datasets containing only hundreds of queries, is insufficient to satisfy the training requirements of existing data-hungry neural models. To address these issues, we introduce an automated method to construct synthetic query-candidate pairs and build the largest LCR dataset to date, LEAD, which is hundreds of times larger than existing datasets. This data construction method can provide ample training signals for LCR models. Experimental results demonstrate that model training with our constructed data can achieve state-of-the-art results on two widely-used LCR benchmarks. Besides, the construction method can also be applied to civil cases and achieve promising results. The data and codes can be found in https://github.com/thunlp/LEAD. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06581v1-abstract-full').style.display = 'none'; document.getElementById('2410.06581v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 3 figures, accepted by EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05639">arXiv:2410.05639</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05639">pdf</a>, <a href="https://arxiv.org/format/2410.05639">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DecorateLM: Data Engineering through Corpus Rating, Tagging, and Editing with Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ranchi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Thai%2C+Z+L">Zhen Leng Thai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengding Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ba%2C+Y">Yunqi Ba</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Jie Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05639v1-abstract-short" style="display: inline;"> The performance of Large Language Models (LLMs) is substantially influenced by the pretraining corpus, which consists of vast quantities of unsupervised data processed by the models. Despite its critical role in model performance, ensuring the quality of this data is challenging due to its sheer volume and the absence of sample-level quality annotations and enhancements. In this paper, we introduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05639v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05639v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05639v1-abstract-full" style="display: none;"> The performance of Large Language Models (LLMs) is substantially influenced by the pretraining corpus, which consists of vast quantities of unsupervised data processed by the models. Despite its critical role in model performance, ensuring the quality of this data is challenging due to its sheer volume and the absence of sample-level quality annotations and enhancements. In this paper, we introduce DecorateLM, a data engineering method designed to refine the pretraining corpus through data rating, tagging and editing. 
Specifically, DecorateLM rates texts against quality criteria, tags texts with hierarchical labels, and edits texts into a more formalized format. Due to the massive size of the pretraining corpus, adopting an LLM for decorating the entire corpus is less efficient. Therefore, to balance performance with efficiency, we curate a meticulously annotated training corpus for DecorateLM using a large language model and distill data engineering expertise into a compact 1.2 billion parameter small language model (SLM). We then apply DecorateLM to enhance 100 billion tokens of the training corpus, selecting 45 billion tokens that exemplify high quality and diversity for the further training of another 1.2 billion parameter LLM. Our results demonstrate that employing such high-quality data can significantly boost model performance, showcasing a powerful approach to enhance the quality of the pretraining corpus.
Submitted 7 October, 2024; originally announced October 2024.
Journal ref: EMNLP 2024
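As a rough illustration of the rate-then-select step described in this abstract, the sketch below scores documents with a quality model and keeps the best ones up to a token budget. The scorer, the whitespace token count, and the budget are all placeholder assumptions; the actual DecorateLM pipeline additionally tags and edits texts and uses a distilled 1.2B scorer.

```python
# Minimal sketch of a rate-then-select step, assuming a hypothetical scorer;
# not the released DecorateLM pipeline.
from typing import Callable, Iterable

def select_high_quality(docs: Iterable[str],
                        quality_score: Callable[[str], float],
                        token_budget: int) -> list[str]:
    """Keep the highest-scoring documents until a token budget is reached."""
    scored = sorted(docs, key=quality_score, reverse=True)
    kept, used = [], 0
    for doc in scored:
        n_tokens = len(doc.split())        # crude whitespace token count
        if used + n_tokens > token_budget:
            break
        kept.append(doc)
        used += n_tokens
    return kept

# Example with a trivial stand-in scorer (longer documents score higher).
corpus = ["short text", "a somewhat longer and more informative document", "x"]
print(select_high_quality(corpus, quality_score=lambda d: len(d.split()), token_budget=8))
```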
arXiv:2410.04283 [pdf]  cs.LG
Applying Hybrid Graph Neural Networks to Strengthen Credit Risk Analysis
Authors: Mengfang Sun, Wenying Sun, Ying Sun, Shaobo Liu, Mohan Jiang, Zhen Xu
Abstract: This paper presents a novel approach to credit risk prediction by employing Graph Convolutional Neural Networks (GCNNs) to assess the creditworthiness of borrowers. Leveraging the power of big data and artificial intelligence, the proposed method addresses the challenges faced by traditional credit risk assessment models, particularly in handling imbalanced datasets and extracting meaningful features from complex relationships. The paper begins by transforming raw borrower data into graph-structured data, where borrowers and their relationships are represented as nodes and edges, respectively. A classic subgraph convolutional model is then applied to extract local features, followed by the introduction of a hybrid GCNN model that integrates both local and global convolutional operators to capture a comprehensive representation of node features. The hybrid model incorporates an attention mechanism to adaptively select features, mitigating issues of over-smoothing and insufficient feature consideration. The study demonstrates the potential of GCNNs in improving the accuracy of credit risk prediction, offering a robust solution for financial institutions seeking to enhance their lending decision-making processes.
Submitted 5 October, 2024; originally announced October 2024.
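The abstract sketches an architecture that mixes local and global graph convolutions and gates them with attention. The self-contained PyTorch layer below illustrates that general pattern on a dense adjacency matrix; it is an assumption-laden toy, not the paper's model, and every name in it is illustrative.

```python
# A minimal PyTorch sketch of the general idea: combine a local (one-hop) and
# a global (whole-graph) aggregation and gate them with a learned attention
# weight. Illustration only, not the paper's architecture.
import torch
import torch.nn as nn

class HybridGCNLayer(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        super().__init__()
        self.local = nn.Linear(in_dim, out_dim)   # one-hop neighbourhood view
        self.glob = nn.Linear(in_dim, out_dim)    # whole-graph view
        self.attn = nn.Linear(2 * out_dim, 2)     # per-node gate over the two views

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # Row-normalised adjacency with self-loops for local aggregation.
        a = adj + torch.eye(adj.size(0))
        a = a / a.sum(dim=1, keepdim=True)
        h_local = torch.relu(self.local(a @ x))
        # "Global" aggregation: every node sees the mean of all node features.
        h_glob = torch.relu(self.glob(x.mean(dim=0, keepdim=True))).expand_as(h_local)
        gate = torch.softmax(self.attn(torch.cat([h_local, h_glob], dim=-1)), dim=-1)
        return gate[:, :1] * h_local + gate[:, 1:] * h_glob

# Toy borrower graph: 4 nodes, 8 raw features each.
x = torch.randn(4, 8)
adj = torch.tensor([[0., 1, 0, 0], [1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0]])
print(HybridGCNLayer(8, 16)(x, adj).shape)   # torch.Size([4, 16])
```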
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04223">arXiv:2410.04223</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04223">pdf</a>, <a href="https://arxiv.org/format/2410.04223">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Large Language Models for Inverse Molecular Design with Retrosynthetic Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Gang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Michael Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Matusik%2C+W">Wojciech Matusik</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M">Meng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04223v1-abstract-short" style="display: inline;"> While large language models (LLMs) have integrated images, adapting them to graphs remains challenging, limiting their applications in materials and drug design. This difficulty stems from the need for coherent autoregressive generation across texts and graphs. To address this, we introduce Llamole, the first multimodal LLM capable of interleaved text and graph generation, enabling molecular inver&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04223v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04223v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04223v1-abstract-full" style="display: none;"> While large language models (LLMs) have integrated images, adapting them to graphs remains challenging, limiting their applications in materials and drug design. This difficulty stems from the need for coherent autoregressive generation across texts and graphs. To address this, we introduce Llamole, the first multimodal LLM capable of interleaved text and graph generation, enabling molecular inverse design with retrosynthetic planning. Llamole integrates a base LLM with the Graph Diffusion Transformer and Graph Neural Networks for multi-conditional molecular generation and reaction inference within texts, while the LLM, with enhanced molecular understanding, flexibly controls activation among the different graph modules. Additionally, Llamole integrates A* search with LLM-based cost functions for efficient retrosynthetic planning. We create benchmarking datasets and conduct extensive experiments to evaluate Llamole against in-context learning and supervised fine-tuning. Llamole significantly outperforms 14 adapted LLMs across 12 metrics for controllable molecular design and retrosynthetic planning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04223v1-abstract-full').style.display = 'none'; document.getElementById('2410.04223v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 11 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03440">arXiv:2410.03440</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03440">pdf</a>, <a href="https://arxiv.org/format/2410.03440">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Benefit of Activation Sparsity in Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhengyan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Q">Qiujieli Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zhiyuan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03440v1-abstract-short" style="display: inline;"> Pre-trained Transformers inherently possess the characteristic of sparse activation, where only a small fraction of the neurons are activated for each token. While sparse activation has been explored through post-training methods, its potential in pre-training remains untapped. In this work, we first study how activation properties change during pre-training. Our examination reveals that Transform&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03440v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03440v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03440v1-abstract-full" style="display: none;"> Pre-trained Transformers inherently possess the characteristic of sparse activation, where only a small fraction of the neurons are activated for each token. While sparse activation has been explored through post-training methods, its potential in pre-training remains untapped. In this work, we first study how activation properties change during pre-training. 
Our examination reveals that Transformers exhibit sparse activation throughout the majority of the pre-training process, while the activation correlation keeps evolving as training progresses. Leveraging this observation, we propose Switchable Sparse-Dense Learning (SSD). SSD adaptively switches between Mixture-of-Experts (MoE) based sparse training and conventional dense training during pre-training, exploiting the efficiency of sparse training while avoiding its static activation correlation. Compared to dense training, SSD achieves comparable performance with identical model size and reduces pre-training costs. Moreover, the models trained with SSD can be directly used as MoE models for sparse inference and achieve the same performance as dense models with up to 2x faster inference speed. Codes are available at https://github.com/thunlp/moefication.
Submitted 4 October, 2024; originally announced October 2024.
Comments: ICML 2024
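Activation sparsity here means that only a small fraction of feed-forward neurons fire for any given token. A quick way to see what is being measured is to count near-zero ReLU activations in one FFN up-projection, as in the simplified probe below; this is an illustration only, and the paper's actual measurements and training code live in the repository linked above.

```python
# Tiny probe for per-token activation sparsity in a ReLU feed-forward layer.
# Simplified illustration of the quantity discussed above, not the paper's code.
import torch
import torch.nn as nn

def activation_sparsity(ffn_in: nn.Linear, hidden: torch.Tensor, thresh: float = 0.0) -> float:
    """Fraction of FFN neurons that are inactive (<= thresh), averaged over tokens."""
    acts = torch.relu(ffn_in(hidden))            # (tokens, ffn_dim)
    return (acts <= thresh).float().mean().item()

tokens = torch.randn(128, 512)                   # 128 token representations
ffn_in = nn.Linear(512, 2048)                    # up-projection of an FFN block
print(f"inactive neuron fraction: {activation_sparsity(ffn_in, tokens):.2f}")
```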
arXiv:2410.03421 [pdf, other]  cs.CL, cs.AI
One2set + Large Language Model: Best Partners for Keyphrase Generation
Authors: Liangying Shao, Liang Zhang, Minlong Peng, Guoqi Ma, Hao Yue, Mingming Sun, Jinsong Su
Abstract: Keyphrase generation (KPG) aims to automatically generate a collection of phrases representing the core concepts of a given document. The dominant paradigms in KPG include one2seq and one2set. Recently, there has been increasing interest in applying large language models (LLMs) to KPG. Our preliminary experiments reveal that it is challenging for a single model to excel in both recall and precision. Further analysis shows that: 1) the one2set paradigm has the advantage of high recall, but suffers from improper assignments of supervision signals during training; 2) LLMs are powerful in keyphrase selection, but existing selection methods often make redundant selections. Given these observations, we introduce a generate-then-select framework that decomposes KPG into two steps, where we adopt a one2set-based model as the generator to produce candidates and then use an LLM as the selector to select keyphrases from these candidates. In particular, we make two important improvements to our generator and selector: 1) we design an Optimal Transport-based assignment strategy to address the above improper assignments; 2) we model keyphrase selection as a sequence labeling task to alleviate redundant selections. Experimental results on multiple benchmark datasets show that our framework significantly surpasses state-of-the-art models, especially in absent keyphrase prediction.
Submitted 20 October, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
Comments: Accepted by EMNLP 2024 Main Conference
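The "improper assignment of supervision signals" that this abstract addresses refers to how one2set prediction slots get matched to gold keyphrases during training. As a simplified stand-in for the paper's Optimal Transport-based strategy, the sketch below does a one-to-one match with the Hungarian algorithm over a token-overlap cost, just to make the assignment step concrete; it is not the paper's formulation.

```python
# Simplified stand-in for the slot-assignment step: match prediction slots to
# gold keyphrases one-to-one with the Hungarian algorithm over a token-overlap
# cost. The paper uses an Optimal Transport-based strategy instead.
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_slots(predicted: list[str], gold: list[str]) -> list[tuple[int, int]]:
    cost = np.zeros((len(predicted), len(gold)))
    for i, p in enumerate(predicted):
        for j, g in enumerate(gold):
            p_tok, g_tok = set(p.split()), set(g.split())
            overlap = len(p_tok & g_tok) / max(len(p_tok | g_tok), 1)
            cost[i, j] = 1.0 - overlap          # low cost = high overlap
    rows, cols = linear_sum_assignment(cost)
    return list(zip(rows.tolist(), cols.tolist()))

print(match_slots(["graph neural network", "credit risk"],
                  ["credit risk analysis", "graph neural networks"]))
```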
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02249">arXiv:2410.02249</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02249">pdf</a>, <a href="https://arxiv.org/format/2410.02249">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Spiking Neural Network as Adaptive Event Stream Slicer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jiahang Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mingyuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Shibo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Renjing Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02249v2-abstract-short" style="display: inline;"> Event-based cameras are attracting significant interest as they provide rich edge information, high dynamic range, and high temporal resolution. Many state-of-the-art event-based algorithms rely on splitting the events into fixed groups, resulting in the omission of crucial temporal information, particularly when dealing with diverse motion scenarios (\eg, high/low speed).In this work, we propose&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02249v2-abstract-full').style.display = 'inline'; document.getElementById('2410.02249v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02249v2-abstract-full" style="display: none;"> Event-based cameras are attracting significant interest as they provide rich edge information, high dynamic range, and high temporal resolution. Many state-of-the-art event-based algorithms rely on splitting the events into fixed groups, resulting in the omission of crucial temporal information, particularly when dealing with diverse motion scenarios (\eg, high/low speed).In this work, we propose SpikeSlicer, a novel-designed plug-and-play event processing method capable of splitting events stream adaptively.SpikeSlicer utilizes a low-energy spiking neural network (SNN) to trigger event slicing. To guide the SNN to fire spikes at optimal time steps, we propose the Spiking Position-aware Loss (SPA-Loss) to modulate the neuron&#39;s state. Additionally, we develop a Feedback-Update training strategy that refines the slicing decisions using feedback from the downstream artificial neural network (ANN). Extensive experiments demonstrate that our method yields significant performance improvements in event-based object tracking and recognition. 
Notably, SpikeSlicer provides a brand-new SNN-ANN cooperation paradigm, where the SNN acts as an efficient, low-energy data processor to assist the ANN in improving downstream performance, injecting new perspectives and potential avenues of exploration.
Submitted 8 November, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024
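The core idea of SpikeSlicer is that a spiking neuron decides when to cut the event stream: it integrates incoming events, and a spike marks a slice boundary. The toy leaky integrate-and-fire neuron below illustrates that triggering mechanism only; the time constant and threshold are arbitrary, and nothing here reflects the trained SNN or the SPA-Loss.

```python
# Toy leaky integrate-and-fire (LIF) neuron used as an adaptive "slicer":
# it accumulates incoming event counts and emits a slice boundary whenever its
# membrane potential crosses threshold. Illustrative only; the paper trains an
# SNN with a dedicated loss (SPA-Loss) to place these spikes.
def lif_slice_points(event_counts, tau=0.9, threshold=50.0):
    """event_counts[t] = number of events in time bin t; returns cut indices."""
    v, cuts = 0.0, []
    for t, n in enumerate(event_counts):
        v = tau * v + n            # leak, then integrate this bin's events
        if v >= threshold:
            cuts.append(t)         # fire: end the current slice here
            v = 0.0                # reset membrane potential
    return cuts

# Sparse motion early on, bursts of fast motion later.
stream = [2, 3, 1, 2, 40, 55, 60, 5, 3, 2, 70, 80]
print(lif_slice_points(stream))    # more cuts where events arrive densely
```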
arXiv:2410.01718 [pdf, other]  cs.CV
COMUNI: Decomposing Common and Unique Video Signals for Diffusion-based Video Generation
Authors: Mingzhen Sun, Weining Wang, Xinxin Zhu, Jing Liu
Abstract: Since videos record objects moving coherently, adjacent video frames have commonness (similar object appearances) and uniqueness (slightly changed postures). To prevent redundant modeling of common video signals, we propose a novel diffusion-based framework, named COMUNI, which decomposes the COMmon and UNIque video signals to enable efficient video generation. Our approach separates the decomposition of video signals from the task of video generation, thus reducing the computation complexity of generative models. In particular, we introduce CU-VAE to decompose video signals and encode them into latent features. To train CU-VAE in a self-supervised manner, we employ a cascading merge module to reconstitute video signals and a time-agnostic video decoder to reconstruct video frames. Then we propose CU-LDM to model latent features for video generation, which adopts two specific diffusion streams to simultaneously model the common and unique latent features. We further utilize additional joint modules for cross modeling of the common and unique latent features, and a novel position embedding method to ensure the content consistency and motion coherence of generated videos. The position embedding method incorporates spatial and temporal absolute position information into the joint modules. Extensive experiments demonstrate the necessity of decomposing common and unique video signals for video generation and the effectiveness and efficiency of our proposed method.
Submitted 2 October, 2024; originally announced October 2024.

arXiv:2410.01594 [pdf, other]  cs.CV
MM-LDM: Multi-Modal Latent Diffusion Model for Sounding Video Generation
Authors: Mingzhen Sun, Weining Wang, Yanyuan Qiao, Jiahui Sun, Zihan Qin, Longteng Guo, Xinxin Zhu, Jing Liu
Abstract: Sounding Video Generation (SVG) is an audio-video joint generation task challenged by high-dimensional signal spaces, distinct data formats, and different patterns of content information. To address these issues, we introduce a novel multi-modal latent diffusion model (MM-LDM) for the SVG task. We first unify the representation of audio and video data by converting them into a single or a couple of images. Then, we introduce a hierarchical multi-modal autoencoder that constructs a low-level perceptual latent space for each modality and a shared high-level semantic feature space. The former space is perceptually equivalent to the raw signal space of each modality but drastically reduces signal dimensions. The latter space serves to bridge the information gap between modalities and provides more insightful cross-modal guidance. Our proposed method achieves new state-of-the-art results with significant quality and efficiency gains. Specifically, our method achieves a comprehensive improvement on all evaluation metrics and faster training and sampling speed on the Landscape and AIST++ datasets. Moreover, we explore its performance on open-domain sounding video generation, long sounding video generation, audio continuation, video continuation, and conditional single-modal generation tasks for a comprehensive evaluation, where our MM-LDM demonstrates exciting adaptability and generalization ability.
Submitted 2 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00256">arXiv:2410.00256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.00256">pdf</a>, <a href="https://arxiv.org/format/2410.00256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.23977/jaip.2024.070316">10.23977/jaip.2024.070316 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhanced Credit Score Prediction Using Ensemble Deep Learning Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Q">Qianwen Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Sining Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Q">Qi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+X">Xingyu Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengying Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00256v2-abstract-short" style="display: inline;"> In contemporary economic society, credit scores are crucial for every participant. A robust credit evaluation system is essential for the profitability of core businesses such as credit cards, loans, and investments for commercial banks and the financial sector. This paper combines high-performance models like XGBoost and LightGBM, already widely used in modern banking systems, with the powerful T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00256v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00256v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00256v2-abstract-full" style="display: none;"> In contemporary economic society, credit scores are crucial for every participant. A robust credit evaluation system is essential for the profitability of core businesses such as credit cards, loans, and investments for commercial banks and the financial sector. This paper combines high-performance models like XGBoost and LightGBM, already widely used in modern banking systems, with the powerful TabNet model. We have developed a potent model capable of accurately determining credit score levels by integrating Random Forest, XGBoost, and TabNet, and through the stacking technique in ensemble modeling. This approach surpasses the limitations of single models and significantly advances the precise credit score prediction. In the following sections, we will explain the techniques we used and thoroughly validate our approach by comprehensively comparing a series of metrics such as Precision, Recall, F1, and AUC. 
By integrating Random Forest, XGBoost, and the TabNet deep learning architecture, these models complement each other and demonstrate exceptionally strong overall performance.
Submitted 12 November, 2024; v1 submitted 30 September, 2024; originally announced October 2024.
Comments: This paper has been accepted by an SCI AI journal.
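Stacking, the ensemble technique named in this abstract, trains several base models and feeds their out-of-fold predictions to a meta-learner. A minimal scikit-learn version of that pattern is sketched below, with GradientBoosting and an MLP standing in for XGBoost and TabNet so the example stays dependency-light; it is an illustration of the technique, not the paper's configuration.

```python
# Minimal scikit-learn illustration of the stacking pattern described above.
# GradientBoosting and an MLP stand in for XGBoost and TabNet; this is not the
# paper's exact configuration.
from sklearn.datasets import make_classification
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              StackingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=2000, n_features=20, weights=[0.8, 0.2], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

stack = StackingClassifier(
    estimators=[
        ("rf", RandomForestClassifier(n_estimators=200, random_state=0)),
        ("gbdt", GradientBoostingClassifier(random_state=0)),   # XGBoost stand-in
        ("mlp", MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500,
                              random_state=0)),                 # TabNet stand-in
    ],
    final_estimator=LogisticRegression(max_iter=1000),          # meta-learner
    cv=5,
)
stack.fit(X_tr, y_tr)
print("AUC:", roc_auc_score(y_te, stack.predict_proba(X_te)[:, 1]))
```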
arXiv:2409.19667 [pdf, other]  cs.CL, cs.AI
Can Large Language Models Analyze Graphs like Professionals? A Benchmark, Datasets and Models
Authors: Xin Li, Weize Chen, Qizhi Chu, Haopeng Li, Zhaojun Sun, Ran Li, Chen Qian, Yiwei Wei, Zhiyuan Liu, Chuan Shi, Maosong Sun, Cheng Yang
Abstract: The need to analyze graphs is ubiquitous across various fields, from social networks to biological research and recommendation systems. Therefore, enabling large language models (LLMs) to process graphs is an important step toward more advanced general intelligence. However, current LLM benchmarks on graph analysis require models to directly reason over prompts describing graph topology, and are thus limited to small graphs with only a few dozen nodes. In contrast, human experts typically write programs based on popular libraries for task solving, and can thus handle graphs of different scales. To this end, a question naturally arises: can LLMs analyze graphs like professionals? In this paper, we introduce ProGraph, a manually crafted benchmark containing 3 categories of graph tasks. The benchmark expects solutions based on programming instead of directly reasoning over raw inputs. Our findings reveal that the performance of current LLMs is unsatisfactory, with the best model achieving only 36% accuracy. To bridge this gap, we propose the LLM4Graph datasets, which include crawled documents and auto-generated codes based on 6 widely used graph libraries. By augmenting closed-source LLMs with document retrieval and fine-tuning open-source ones on the codes, we show 11-32% absolute improvements in their accuracies. Our results underscore that the capabilities of LLMs in handling structured data are still under-explored, and show the effectiveness of LLM4Graph in enhancing LLMs' proficiency in graph analysis. The benchmark, datasets and enhanced open-source models are available at https://github.com/BUPT-GAMMA/ProGraph.
Submitted 19 October, 2024; v1 submitted 29 September, 2024; originally announced September 2024.
Comments: NeurIPS 2024
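ProGraph scores models on whether they can answer graph questions by writing programs against popular graph libraries rather than reasoning over a textual edge list in the prompt. The snippet below shows the kind of short NetworkX program such a program-based solution amounts to, on a made-up task that is not drawn from the benchmark.

```python
# The kind of short program-based solution ProGraph expects: instead of
# reasoning over an edge list in the prompt, write code against a standard
# graph library. Illustrative task only, not taken from the benchmark.
import networkx as nx

edges = [(0, 1), (1, 2), (2, 3), (3, 4), (1, 4), (4, 5)]
G = nx.Graph(edges)

# e.g. "What is the shortest path from node 0 to node 5, and which node has
# the highest degree centrality?"
path = nx.shortest_path(G, source=0, target=5)
centrality = nx.degree_centrality(G)
print("shortest path:", path)
print("most central node:", max(centrality, key=centrality.get))
```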
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14010">arXiv:2409.14010</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14010">pdf</a>, <a href="https://arxiv.org/format/2409.14010">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> </div> </div> <p class="title is-5 mathjax"> RRD-Bio: Building An Integrated Research Resource Database for Biomedicine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengting Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+C">Chong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haihua Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14010v1-abstract-short" style="display: inline;"> Research resources (RRs) such as data, software, and tools are essential pillars of scientific research. The field of biomedicine, a critical scientific discipline, is witnessing a surge in research publications resulting in the accumulation of a substantial number of RRs. However, these resources are dispersed among various biomedical articles and can be challenging to locate and reuse due to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14010v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14010v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14010v1-abstract-full" style="display: none;"> Research resources (RRs) such as data, software, and tools are essential pillars of scientific research. The field of biomedicine, a critical scientific discipline, is witnessing a surge in research publications resulting in the accumulation of a substantial number of RRs. However, these resources are dispersed among various biomedical articles and can be challenging to locate and reuse due to their transient nature. In this paper, we report our recent progress in biomedical data curation - building a large research resource database for biomedicine (RRD-Bio), based on a collection of 40 million papers from two large biomedical literature databases, PubMed and PubMed Central. The database contains 2,555,116 RRs, each identified by a location on the Internet (URL) and descriptive information (Context). We made the RRD-Bio database publicly available (\url{https://zenodo.org/records/10526493}) to enhance the visibility of biomedical research resources, the ability to preserve important resources and the reproducibility of biomedical research. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14010v1-abstract-full').style.display = 'none'; document.getElementById('2409.14010v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13731">arXiv:2409.13731</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13731">pdf</a>, <a href="https://arxiv.org/format/2409.13731">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> KAG: Boosting LLMs in Professional Domains via Knowledge Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+L">Lei Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+Z">Zhengke Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhongshu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhouyu Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+L">Ling Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+Y">Yuan Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+P">Peilong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Bo%2C+Z">Zhongpu Bo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Huaidong Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+L">Lin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zaoyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huajun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wenguang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13731v3-abstract-short" style="display: inline;"> The recently developed retrieval-augmented generation (RAG) technology has enabled the efficient construction of domain-specific applications. 
However, it also has limitations, including the gap between vector similarity and the relevance of knowledge reasoning, as well as insensitivity to knowledge logic, such as numerical values, temporal relations, expert rules, and others, which hinder the effectiveness of professional knowledge services. In this work, we introduce a professional domain knowledge service framework called Knowledge Augmented Generation (KAG). KAG is designed to address the aforementioned challenges by making full use of the advantages of knowledge graphs (KGs) and vector retrieval, and to improve generation and reasoning performance by bidirectionally enhancing large language models (LLMs) and KGs through five key aspects: (1) LLM-friendly knowledge representation, (2) mutual indexing between knowledge graphs and original chunks, (3) a logical-form-guided hybrid reasoning engine, (4) knowledge alignment with semantic reasoning, and (5) model capability enhancement for KAG. We compared KAG with existing RAG methods in multi-hop question answering and found that it significantly outperforms state-of-the-art methods, achieving a relative improvement of 19.6% on 2wiki and 33.5% on hotpotQA in terms of F1 score. We have successfully applied KAG to two professional knowledge Q&A tasks at Ant Group, including E-Government Q&A and E-Health Q&A, achieving significant improvement in professionalism compared to RAG methods.
Submitted 26 September, 2024; v1 submitted 9 September, 2024; originally announced September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+M&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul 
class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
