Search | arXiv e-print repository
Showing 1–50 of 847 results for author: Han, X

Searching in archive cs. Results sorted by announcement date (newest first), 50 results per page.
1. arXiv:2411.13306 [pdf, other] (cs.RO)
   Flexible electrical impedance tomography for tactile interfaces
   Authors: Huazhi Dong, Sihao Teng, Xiaopeng Wu, Xu Han, Francesco Giorgio-Serchi, Yunjie Yang
   Abstract: Flexible electrical impedance tomography (EIT) is an emerging technology for tactile sensing in human-machine interfaces (HMI). It offers a unique alternative to traditional array-based tactile sensors with its flexible, scalable, and cost-effective one-piece design. This paper proposes a lattice-patterned flexible EIT tactile sensor with a hydrogel-based conductive layer, designed for enhanced sensitivity while maintaining durability. We conducted simulation studies to explore the influence of lattice width and conductive layer thickness on sensor performance, establishing optimized sensor design parameters for enhanced functionality. Experimental evaluations demonstrate the sensor's capacity to detect diverse tactile patterns with high accuracy. The practical utility of the sensor is demonstrated through its integration within an HMI setup to control a virtual game, showcasing its potential for dynamic, multi-functional tactile interactions in real-time applications. This study reinforces the potential of EIT-based flexible tactile sensors, establishing a foundation for future advancements in wearable, adaptable HMI technologies.
   Submitted 20 November, 2024; originally announced November 2024.
2. arXiv:2411.11507 [pdf, other] (cs.CV)
   SignEye: Traffic Sign Interpretation from Vehicle First-Person View
   Authors: Chuang Yang, Xu Han, Tao Han, Yuejiao SU, Junyu Gao, Hongyuan Zhang, Yi Wang, Lap-Pui Chau
   Abstract: Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction navigation. To address these issues, we introduce a new task: traffic sign interpretation from the vehicle's first-person view, referred to as TSI-FPV. Meanwhile, we develop a traffic guidance assistant (TGA) scenario application to re-explore the role of traffic signs in ADS as a complement to popular autonomous technologies (such as obstacle perception). Notably, TGA is not a replacement for electronic map navigation; rather, TGA can be an automatic tool for updating and complementing it in situations such as offline conditions or temporary sign adjustments. Lastly, a spatial and semantic logic-aware stepwise reasoning pipeline (SignEye) is constructed to achieve the TSI-FPV and TGA, and an application-specific dataset (Traffic-CN) is built. Experiments show that TSI-FPV and TGA are achievable via our SignEye trained on Traffic-CN. The results also demonstrate that the TGA can provide complementary information to ADS beyond existing popular autonomous technologies.
   Submitted 18 November, 2024; originally announced November 2024.

3. arXiv:2411.11504 [pdf, other] (cs.AI, cs.CL, stat.ML)
   Search, Verify and Feedback: Towards Next Generation Post-training Paradigm of Foundation Models via Verifier Engineering
   Authors: Xinyan Guan, Yanjiang Liu, Xinyu Lu, Boxi Cao, Ben He, Xianpei Han, Le Sun, Jie Lou, Bowen Yu, Yaojie Lu, Hongyu Lin
   Abstract: The evolution of machine learning has increasingly prioritized the development of powerful models and more scalable supervision signals. However, the emergence of foundation models presents significant challenges in providing effective supervision signals necessary for further enhancing their capabilities. Consequently, there is an urgent need to explore novel supervision signals and technical approaches. In this paper, we propose verifier engineering, a novel post-training paradigm specifically designed for the era of foundation models. The core of verifier engineering involves leveraging a suite of automated verifiers to perform verification tasks and deliver meaningful feedback to foundation models. We systematically categorize the verifier engineering process into three essential stages: search, verify, and feedback, and provide a comprehensive review of state-of-the-art research developments within each stage. We believe that verifier engineering constitutes a fundamental pathway toward achieving Artificial General Intelligence.
   Submitted 18 November, 2024; originally announced November 2024.

4. arXiv:2411.11161 [pdf, other] (cs.LG, cs.AI)
   MPLite: Multi-Aspect Pretraining for Mining Clinical Health Records
   Authors: Eric Yang, Pengfei Hu, Xiaoxue Han, Yue Ning
   Abstract: The adoption of digital systems in healthcare has resulted in the accumulation of vast electronic health records (EHRs), offering valuable data for machine learning methods to predict patient health outcomes. However, single-visit records of patients are often neglected in the training process due to the lack of annotations of next-visit information, thereby limiting the predictive and expressive power of machine learning models. In this paper, we present a novel framework MPLite that utilizes Multi-aspect Pretraining with Lab results through a light-weight neural network to enhance medical concept representation and predict future health outcomes of individuals. By incorporating both structured medical data and additional information from lab results, our approach fully leverages patient admission records. We design a pretraining module that predicts medical codes based on lab results, ensuring robust prediction by fusing multiple aspects of features. Our experimental evaluation using both MIMIC-III and MIMIC-IV datasets demonstrates improvements over existing models in diagnosis prediction and heart failure prediction tasks, achieving a higher weighted-F1 and recall with MPLite. This work reveals the potential of integrating diverse aspects of data to advance predictive modeling in healthcare.
   Submitted 17 November, 2024; originally announced November 2024.

5. arXiv:2411.10033 [pdf, other] (cs.CV)
   DOI: 10.1111/cgf.15215
   GSEditPro: 3D Gaussian Splatting Editing with Attention-based Progressive Localization
   Authors: Yanhao Sun, RunZe Tian, Xiao Han, XinYao Liu, Yan Zhang, Kai Xu
   Abstract: With the emergence of large-scale Text-to-Image (T2I) models and implicit 3D representations like Neural Radiance Fields (NeRF), many text-driven generative editing methods based on NeRF have appeared. However, the implicit encoding of geometric and textural information poses challenges in accurately locating and controlling objects during editing. Recently, significant advancements have been made in the editing methods of 3D Gaussian Splatting, a real-time rendering technology that relies on explicit representation. However, these methods still suffer from issues including inaccurate localization and limited manipulation over editing. To tackle these challenges, we propose GSEditPro, a novel 3D scene editing framework which allows users to perform various creative and precise editing using text prompts only. Leveraging the explicit nature of the 3D Gaussian distribution, we introduce an attention-based progressive localization module to add semantic labels to each Gaussian during rendering. This enables precise localization on editing areas by classifying Gaussians based on their relevance to the editing prompts derived from cross-attention layers of the T2I model. Furthermore, we present an innovative editing optimization method based on 3D Gaussian Splatting, obtaining stable and refined editing results through the guidance of Score Distillation Sampling and pseudo ground truth. We prove the efficacy of our method through extensive experiments.
   Submitted 15 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pacific Graphics 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Computer Graphics Forum (2024), 43: e15215 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09429">arXiv:2411.09429</a> <span> [<a href="https://arxiv.org/pdf/2411.09429">pdf</a>, <a href="https://arxiv.org/format/2411.09429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Superconductivity">cond-mat.supr-con</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AI-driven inverse design of materials: Past, present and future </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiao-Qi Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin-De Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Meng-Yuan Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhen Feng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bo-Wen Yao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Peng-Jie Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ze-Feng Gao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhong-Yi Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09429v1-abstract-short" style="display: inline;"> The discovery of advanced materials is the cornerstone of human technological development and progress. The structures of materials and their corresponding properties are essentially the result of a complex interplay of multiple degrees of freedom such as lattice, charge, spin, symmetry, and topology. This poses significant challenges for the inverse design methods of materials. Humans have long e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09429v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09429v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09429v1-abstract-full" style="display: none;"> The discovery of advanced materials is the cornerstone of human technological development and progress. The structures of materials and their corresponding properties are essentially the result of a complex interplay of multiple degrees of freedom such as lattice, charge, spin, symmetry, and topology. This poses significant challenges for the inverse design methods of materials. Humans have long explored new materials through a large number of experiments and proposed corresponding theoretical systems to predict new material properties and structures. With the improvement of computational power, researchers have gradually developed various electronic structure calculation methods, particularly such as the one based density functional theory, as well as high-throughput computational methods. 
Recently, the rapid development of artificial intelligence technology in the field of computer science has enabled the effective characterization of the implicit association between material properties and structures, thus opening up an efficient paradigm for the inverse design of functional materials. A significant progress has been made in inverse design of materials based on generative and discriminative models, attracting widespread attention from researchers. Considering this rapid technological progress, in this survey, we look back on the latest advancements in AI-driven inverse design of materials by introducing the background, key findings, and mainstream technological development routes. In addition, we summarize the remaining issues for future directions. This survey provides the latest overview of AI-driven inverse design of materials, which can serve as a useful resource for researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09429v1-abstract-full').style.display = 'none'; document.getElementById('2411.09429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">43 pages, 5 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07579">arXiv:2411.07579</a> <span> [<a href="https://arxiv.org/pdf/2411.07579">pdf</a>, <a href="https://arxiv.org/format/2411.07579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Projecting Gaussian Ellipsoids While Avoiding Affine Projection Approximation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+H">Han Qi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+T">Tao Cai</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiyue Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07579v3-abstract-short" style="display: inline;"> Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its real-time rendering speed and state-of-the-art rendering quality. However, during the rendering process, the use of the Jacobian of the affine approximation of the projection transformation leads to inevitable errors, resulting in blurriness, artifacts and a lack of scene consistency in the final rendered images. 
   Abstract: Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its real-time rendering speed and state-of-the-art rendering quality. However, during the rendering process, the use of the Jacobian of the affine approximation of the projection transformation leads to inevitable errors, resulting in blurriness, artifacts, and a lack of scene consistency in the final rendered images. To address this issue, we introduce an ellipsoid-based projection method to calculate the projection of a Gaussian ellipsoid, the primitive of 3D Gaussian Splatting, onto the image plane. As our proposed ellipsoid-based projection method cannot handle Gaussian ellipsoids with camera origins inside them or parts lying below the $z=0$ plane in camera space, we designed a pre-filtering strategy. Experiments over multiple widely adopted benchmark datasets show that our ellipsoid-based projection method can enhance the rendering quality of 3D Gaussian Splatting and its extensions.
   Submitted 14 November, 2024; v1 submitted 12 November, 2024; originally announced November 2024.

8. arXiv:2411.05738 [pdf, other] (cs.CV)
   StdGEN: Semantic-Decomposed 3D Character Generation from Single Images
   Authors: Yuze He, Yanning Zhou, Wang Zhao, Zhongkai Wu, Kaiwen Xiao, Wei Yang, Yong-Jin Liu, Xiao Han
   Abstract: We present StdGEN, an innovative pipeline for generating semantically decomposed, high-quality 3D characters from single images, enabling broad applications in virtual reality, gaming, and filmmaking. Unlike previous methods which struggle with limited decomposability, unsatisfactory quality, and long optimization times, StdGEN features decomposability, effectiveness, and efficiency; i.e., it generates intricately detailed 3D characters with separated semantic components such as the body, clothes, and hair, in three minutes. At the core of StdGEN is our proposed Semantic-aware Large Reconstruction Model (S-LRM), a transformer-based generalizable model that jointly reconstructs geometry, color and semantics from multi-view images in a feed-forward manner. A differentiable multi-layer semantic surface extraction scheme is introduced to acquire meshes from hybrid implicit fields reconstructed by our S-LRM. Additionally, a specialized efficient multi-view diffusion model and an iterative multi-layer surface refinement module are integrated into the pipeline to facilitate high-quality, decomposable 3D character generation. Extensive experiments demonstrate our state-of-the-art performance in 3D anime character generation, surpassing existing baselines by a significant margin in geometry, texture and decomposability. StdGEN offers ready-to-use semantic-decomposed 3D characters and enables flexible customization for a wide range of applications. Project page: https://stdgen.github.io
   Submitted 8 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04788">arXiv:2411.04788</a> <span> [<a href="https://arxiv.org/pdf/2411.04788">pdf</a>, <a href="https://arxiv.org/format/2411.04788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistical Finance">q-fin.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Trading and Market Microstructure">q-fin.TR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Investment Analysis: Optimizing AI-Agent Collaboration in Financial Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xuewen Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Neng Wang</a>, <a href="/search/cs?searchtype=author&query=Che%2C+S">Shangkun Che</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongyang Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kunpeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S+X">Sean Xin Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04788v1-abstract-short" style="display: inline;"> In recent years, the application of generative artificial intelligence (GenAI) in financial analysis and investment decision-making has gained significant attention. However, most existing approaches rely on single-agent systems, which fail to fully utilize the collaborative potential of multiple AI agents. In this paper, we propose a novel multi-agent collaboration system designed to enhance deci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04788v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04788v1-abstract-full" style="display: none;"> In recent years, the application of generative artificial intelligence (GenAI) in financial analysis and investment decision-making has gained significant attention. However, most existing approaches rely on single-agent systems, which fail to fully utilize the collaborative potential of multiple AI agents. In this paper, we propose a novel multi-agent collaboration system designed to enhance decision-making in financial investment research. The system incorporates agent groups with both configurable group sizes and collaboration structures to leverage the strengths of each agent group type. By utilizing a sub-optimal combination strategy, the system dynamically adapts to varying market conditions and investment scenarios, optimizing performance across different tasks. 
We focus on three sub-tasks: fundamentals, market sentiment, and risk analysis, by analyzing the 2023 SEC 10-K forms of 30 companies listed on the Dow Jones Index. Our findings reveal significant performance variations based on the configurations of AI agents for different tasks. The results demonstrate that our multi-agent collaboration system outperforms traditional single-agent models, offering improved accuracy, efficiency, and adaptability in complex financial environments. This study highlights the potential of multi-agent systems in transforming financial analysis and investment decision-making by integrating diverse analytical perspectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04788v1-abstract-full').style.display = 'none'; document.getElementById('2411.04788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03047">arXiv:2411.03047</a> <span> [<a href="https://arxiv.org/pdf/2411.03047">pdf</a>, <a href="https://arxiv.org/format/2411.03047">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> GarVerseLOD: High-Fidelity 3D Garment Reconstruction from a Single In-the-Wild Image using a Dataset with Levels of Details </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhongjin Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haolin Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenghong Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Wanghao Du</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zirong Jin</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wanhu Sun</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Y">Yinyu Nie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weikai Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoguang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03047v1-abstract-short" style="display: inline;"> Neural implicit functions have brought impressive advances to the state-of-the-art of clothed human digitization from multiple or even single images. However, despite the progress, current arts still have difficulty generalizing to unseen images with complex cloth deformation and body poses. 
   Abstract: Neural implicit functions have brought impressive advances to the state-of-the-art of clothed human digitization from multiple or even single images. However, despite the progress, current arts still have difficulty generalizing to unseen images with complex cloth deformation and body poses. In this work, we present GarVerseLOD, a new dataset and framework that paves the way to achieving unprecedented robustness in high-fidelity 3D garment reconstruction from a single unconstrained image. Inspired by the recent success of large generative models, we believe that one key to addressing the generalization challenge lies in the quantity and quality of 3D garment data. Towards this end, GarVerseLOD collects 6,000 high-quality cloth models with fine-grained geometry details manually created by professional artists. In addition to the scale of training data, we observe that having disentangled granularities of geometry can play an important role in boosting the generalization capability and inference accuracy of the learned model. We hence craft GarVerseLOD as a hierarchical dataset with levels of details (LOD), spanning from detail-free stylized shape to pose-blended garment with pixel-aligned details. This allows us to make this highly under-constrained problem tractable by factorizing the inference into easier tasks, each narrowed down with a smaller searching space. To ensure GarVerseLOD can generalize well to in-the-wild images, we propose a novel labeling paradigm based on conditional diffusion models to generate extensive paired images for each garment model with high photorealism. We evaluate our method on a massive amount of in-the-wild images. Experimental results demonstrate that GarVerseLOD can generate standalone garment pieces with significantly better quality than prior approaches. Project page: https://garverselod.github.io/
   Submitted 5 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://garverselod.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02794">arXiv:2411.02794</a> <span> [<a href="https://arxiv.org/pdf/2411.02794">pdf</a>, <a href="https://arxiv.org/format/2411.02794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Text Detection with Similar Mask in Traffic, Industrial, and Natural Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02794v1-abstract-short" style="display: inline;"> Texts on the intelligent transportation scene include mass information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation. Unlike the general scene, detecting text in transportation has extra demand, such as a fast inference speed, except for high accuracy. Most existing real-time text detection methods are based on the shrink mask, which los… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02794v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02794v1-abstract-full" style="display: none;"> Texts on the intelligent transportation scene include mass information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation. Unlike the general scene, detecting text in transportation has extra demand, such as a fast inference speed, except for high accuracy. Most existing real-time text detection methods are based on the shrink mask, which loses some geometry semantic information and needs complex post-processing. In addition, the previous method usually focuses on correct output, which ignores feature correction and lacks guidance during the intermediate process. To this end, we propose an efficient multi-scene text detector that contains an effective text representation similar mask (SM) and a feature correction module (FCM). Unlike previous methods, the former aims to preserve the geometric information of the instances as much as possible. Its post-progressing saves 50$\%$ of the time, accurately and efficiently reconstructing text contours. The latter encourages false positive features to move away from the positive feature center, optimizing the predictions from the feature level. Some ablation studies demonstrate the efficiency of the SM and the effectiveness of the FCM. 
Moreover, the deficiency of existing traffic datasets (such as the low-quality annotation or closed source data unavailability) motivated us to collect and annotate a traffic text dataset, which introduces motion blur. In addition, to validate the scene robustness of the SM-Net, we conduct experiments on traffic, industrial, and natural scene datasets. Extensive experiments verify it achieves (SOTA) performance on several benchmarks. The code and dataset are available at: \url{https://github.com/fengmulin/SMNet}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02794v1-abstract-full').style.display = 'none'; document.getElementById('2411.02794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02335">arXiv:2411.02335</a> <span> [<a href="https://arxiv.org/pdf/2411.02335">pdf</a>, <a href="https://arxiv.org/format/2411.02335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Sparsing Law: Towards Large Language Models with Greater Activation Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yuqi Luo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+C">Chenyang Song</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingfa Chen</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02335v1-abstract-short" style="display: inline;"> Activation sparsity denotes the existence of substantial weakly-contributed elements within activation outputs that can be eliminated, benefiting many important applications concerned with large language models (LLMs). Although promoting greater activation sparsity within LLMs deserves deep studies, existing works lack comprehensive and quantitative research on the correlation between activation s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02335v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02335v1-abstract-full" style="display: none;"> Activation sparsity denotes the existence of substantial weakly-contributed elements within activation outputs that can be eliminated, benefiting many important applications concerned with large language models (LLMs). 
Although promoting greater activation sparsity within LLMs deserves deep studies, existing works lack comprehensive and quantitative research on the correlation between activation sparsity and potentially influential factors. In this paper, we present a comprehensive study on the quantitative scaling properties and influential factors of the activation sparsity within decoder-only Transformer-based LLMs. Specifically, we propose PPL-$p\%$ sparsity, a precise and performance-aware activation sparsity metric that is applicable to any activation function. Through extensive experiments, we find several important phenomena. Firstly, different activation functions exhibit comparable performance but opposite training-time sparsity trends. The activation ratio (i.e., $1-\mathrm{sparsity\ ratio}$) evolves as a convergent increasing power-law and decreasing logspace power-law with the amount of training data for SiLU-activated and ReLU-activated LLMs, respectively. These demonstrate that ReLU is more efficient as the activation function than SiLU and can leverage more training data to improve activation sparsity. Secondly, the activation ratio linearly increases with the width-depth ratio below a certain bottleneck point, indicating the potential advantage of a deeper architecture at a fixed parameter scale. Finally, at similar width-depth ratios, we surprisingly find that the limit value of activation sparsity varies weakly with the parameter scale, i.e., the activation patterns within LLMs are insensitive to the parameter scale. These empirical laws towards LLMs with greater activation sparsity have important implications for making LLMs more efficient and interpretable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02335v1-abstract-full').style.display = 'none'; document.getElementById('2411.02335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
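<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">The PPL-$p\%$ metric above is performance-aware; as rough intuition only, the sketch below measures a simpler magnitude-threshold activation ratio and fits the log-log (power-law) trend that the abstract describes. The threshold rule, shapes, and toy numbers are illustrative assumptions, not the paper's method.</span> </p> <pre><code># Hedged sketch: a magnitude-threshold activation ratio and a power-law fit.
# Not the paper's PPL-p% metric; eps, shapes, and the toy data are assumed.
import numpy as np

rng = np.random.default_rng(0)

def activation_ratio(acts, eps=1e-3):
    # fraction of entries whose magnitude exceeds eps, i.e. 1 - sparsity ratio
    return float(np.mean(np.abs(acts) > eps))

# one batch of fake hidden activations, with roughly half the entries zeroed
acts = rng.normal(size=(4, 128)) * (rng.random((4, 128)) > 0.5)
print("activation ratio:", activation_ratio(acts))

# toy trend of activation ratio vs. training tokens: a power law r = a * D^b
# is linear in log-log space, so ordinary least squares recovers the exponent.
tokens = np.array([1e9, 5e9, 1e10, 5e10, 1e11])
ratios = np.array([0.35, 0.30, 0.28, 0.25, 0.24])
b, log_a = np.polyfit(np.log(tokens), np.log(ratios), 1)
print(f"fitted exponent b = {b:.3f}")</code></pre>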
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 13 figures, 6 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01503">arXiv:2411.01503</a> <span> [<a href="https://arxiv.org/pdf/2411.01503">pdf</a>, <a href="https://arxiv.org/format/2411.01503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> LumosCore: Highly Scalable LLM Clusters with Optical Interconnect </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xinchi Han</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shizhen Zhao</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yongxi Lv</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+P">Peirui Cao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weihao Jiang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shengkai Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinbing Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01503v1-abstract-short" style="display: inline;"> The emergence of Large Language Model(LLM) technologies has led to a rapidly growing demand for compute resources in models. In response, the enterprises are building large-scale multi-tenant GPU clusters with 10k or even ore GPUs. In contrast to the rapidly growing cluster size, the bandwidth of clusters has also been increasing to meet communication demands, with 800 Gbps optical modules already… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01503v1-abstract-full" style="display: none;"> The emergence of Large Language Model(LLM) technologies has led to a rapidly growing demand for compute resources in models. In response, the enterprises are building large-scale multi-tenant GPU clusters with 10k or even ore GPUs. In contrast to the rapidly growing cluster size, the bandwidth of clusters has also been increasing to meet communication demands, with 800 Gbps optical modules already in practical use and 1.6 Tbps modules on the horizon. However, designing clusters that simultaneously meet the requirements of large scale and high bandwidth is challenging due to the limited capacity of electrical switch chips. Unlike electrical switch chips, the single-port bandwidth of MEMS-OCS is solely determined by the optical module, making it straightforward to achieve both bandwidth and scability requirement. In this paper, we propose an opto-electronic hybrid architecture called \textbf{LumosCore}. We address the issues of L2 protocols incompatibility potential network contention and algorithm time complexity through physical topology and logical topology design. 
Additionally, we design a polynomial-time complexity link reconfiguration algorithm to reconfigure MEMS-OCS with minimal time overhead. We validate the feasibility of the proposed scheme in a cluster consisting of 128 NPUs, and through simulation based on real traces, we demonstrate the superiority of \textbf{LumosCore} over traditional architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01503v1-abstract-full').style.display = 'none'; document.getElementById('2411.01503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00823">arXiv:2411.00823</a> <span> [<a href="https://arxiv.org/pdf/2411.00823">pdf</a>, <a href="https://arxiv.org/format/2411.00823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Mobility-LLM: Learning Visiting Intentions and Travel Preferences from Human Mobility Data with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+L">Letian Gong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yiwen Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xuedi Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00823v1-abstract-short" style="display: inline;"> Location-based services (LBS) have accumulated extensive human mobility data on diverse behaviors through check-in sequences. These sequences offer valuable insights into users' intentions and preferences. Yet, existing models analyzing check-in sequences fail to consider the semantics contained in these sequences, which closely reflect human visiting intentions and travel preferences, leading to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00823v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00823v1-abstract-full" style="display: none;"> Location-based services (LBS) have accumulated extensive human mobility data on diverse behaviors through check-in sequences. 
These sequences offer valuable insights into users' intentions and preferences. Yet, existing models analyzing check-in sequences fail to consider the semantics contained in these sequences, which closely reflect human visiting intentions and travel preferences, leading to an incomplete comprehension. Drawing inspiration from the exceptional semantic understanding and contextual information processing capabilities of large language models (LLMs) across various domains, we present Mobility-LLM, a novel framework that leverages LLMs to analyze check-in sequences for multiple tasks. Since LLMs cannot directly interpret check-ins, we reprogram these sequences to help LLMs comprehensively understand the semantics of human visiting intentions and travel preferences. Specifically, we introduce a visiting intention memory network (VIMN) to capture the visiting intentions at each record, along with a shared pool of human travel preference prompts (HTPP) to guide the LLM in understanding users' travel preferences. These components enhance the model's ability to extract and leverage semantic information from human mobility data effectively. Extensive experiments on four benchmark datasets and three downstream tasks demonstrate that our approach significantly outperforms existing models, underscoring the effectiveness of Mobility-LLM in advancing our understanding of human mobility data within LBS contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00823v1-abstract-full').style.display = 'none'; document.getElementById('2411.00823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
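<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">A minimal sketch of how a shared prompt pool in the spirit of HTPP might retrieve travel-preference prompts for a pooled check-in embedding. The names, shapes, and cosine-similarity retrieval rule are assumptions for illustration, not the paper's implementation.</span> </p> <pre><code># Hedged sketch: retrieve the best-matching preference prompts from a shared
# pool by cosine similarity. All shapes and names are assumed, not Mobility-LLM's.
import numpy as np

rng = np.random.default_rng(0)
d = 64                                     # embedding width (assumed)
prompt_pool = rng.normal(size=(100, d))    # 100 learnable preference prompts
seq_emb = rng.normal(size=(d,))            # pooled check-in sequence embedding

# cosine similarity between the sequence embedding and every pooled prompt
sims = prompt_pool @ seq_emb / (
    np.linalg.norm(prompt_pool, axis=1) * np.linalg.norm(seq_emb) + 1e-9)
top_k = np.argsort(sims)[-5:][::-1]        # indices of the 5 closest prompts
print("selected prompt ids:", top_k)</code></pre>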
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21083">arXiv:2410.21083</a> <span> [<a href="https://arxiv.org/pdf/2410.21083">pdf</a>, <a href="https://arxiv.org/format/2410.21083">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Stealthy Jailbreak Attacks on Large Language Models via Benign Data Mirroring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mu%2C+H">Honglin Mu</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Han He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuxin Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yunlong Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yang Xu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+L">Libo Qin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaoming Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zeming Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xudong Han</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Q">Qi Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qingfu Zhu</a>, <a href="/search/cs?searchtype=author&query=Che%2C+W">Wanxiang Che</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21083v1-abstract-short" style="display: inline;"> Large language model (LLM) safety is a critical issue, with numerous studies employing red team testing to enhance model security. Among these, jailbreak methods explore potential vulnerabilities by crafting malicious prompts that induce model outputs contrary to safety alignments. Existing black-box jailbreak methods often rely on model feedback, repeatedly submitting queries with detectable mali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21083v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21083v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21083v1-abstract-full" style="display: none;"> Large language model (LLM) safety is a critical issue, with numerous studies employing red team testing to enhance model security. Among these, jailbreak methods explore potential vulnerabilities by crafting malicious prompts that induce model outputs contrary to safety alignments. Existing black-box jailbreak methods often rely on model feedback, repeatedly submitting queries with detectable malicious instructions during the attack search process. Although these approaches are effective, the attacks may be intercepted by content moderators during the search process. We propose an improved transfer attack method that guides malicious prompt construction by locally training a mirror model of the target black-box model through benign data distillation. 
This method offers enhanced stealth, as it does not involve submitting identifiable malicious instructions to the target model during the search phase. Our approach achieved a maximum attack success rate of 92%, or a balanced value of 80% with an average of 1.5 detectable jailbreak queries per sample against GPT-3.5 Turbo on a subset of AdvBench. These results underscore the need for more robust defense mechanisms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21083v1-abstract-full').style.display = 'none'; document.getElementById('2410.21083v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21027">arXiv:2410.21027</a> <span> [<a href="https://arxiv.org/pdf/2410.21027">pdf</a>, <a href="https://arxiv.org/format/2410.21027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Transferable Post-training via Inverse Value Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xinyu Lu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+X">Xueru Wen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haiyang Yu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongbin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21027v1-abstract-short" style="display: inline;"> As post-training processes utilize increasingly large datasets and base models continue to grow in size, the computational demands and implementation challenges of existing algorithms are escalating significantly. In this paper, we propose modeling the changes at the logits level during post-training using a separate neural network (i.e., the value network). After training this network on a small… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21027v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21027v1-abstract-full" style="display: none;"> As post-training processes utilize increasingly large datasets and base models continue to grow in size, the computational demands and implementation challenges of existing algorithms are escalating significantly. In this paper, we propose modeling the changes at the logits level during post-training using a separate neural network (i.e., the value network). 
After training this network on a small base model using demonstrations, it can be seamlessly integrated with other pre-trained models during inference, enabling them to achieve similar capability enhancements. We systematically investigate the best practices for this paradigm in terms of pre-training weights and connection schemes. We demonstrate that the resulting value network has broad transferability across pre-trained models of different parameter sizes within the same family, models undergoing continuous pre-training within the same family, and models with different vocabularies across families. In certain cases, it can achieve performance comparable to full-parameter fine-tuning. Furthermore, we explore methods to enhance the transferability of the value model and prevent overfitting to the base model used during training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21027v1-abstract-full').style.display = 'none'; document.getElementById('2410.21027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20295">arXiv:2410.20295</a> <span> [<a href="https://arxiv.org/pdf/2410.20295">pdf</a>, <a href="https://arxiv.org/format/2410.20295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeCaf: A Causal Decoupling Framework for OOD Generalization on Node Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoxue Han</a>, <a href="/search/cs?searchtype=author&query=Rangwala%2C+H">Huzefa Rangwala</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+Y">Yue Ning</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20295v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) are susceptible to distribution shifts, creating vulnerability and security issues in critical domains. There is a pressing need to enhance the generalizability of GNNs on out-of-distribution (OOD) test data. Existing methods that target learning an invariant (feature, structure)-label mapping often depend on oversimplified assumptions about the data generation process… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20295v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20295v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) are susceptible to distribution shifts, creating vulnerability and security issues in critical domains. There is a pressing need to enhance the generalizability of GNNs on out-of-distribution (OOD) test data. 
Existing methods that target learning an invariant (feature, structure)-label mapping often depend on oversimplified assumptions about the data generation process, which do not adequately reflect the actual dynamics of distribution shifts in graphs. In this paper, we introduce a more realistic graph data generation model using Structural Causal Models (SCMs), allowing us to redefine distribution shifts by pinpointing their origins within the generation process. Building on this, we propose a causal decoupling framework, DeCaf, that independently learns unbiased feature-label and structure-label mappings. We provide a detailed theoretical framework that shows how our approach can effectively mitigate the impact of various distribution shifts. We evaluate DeCaf across both real-world and synthetic datasets that demonstrate different patterns of shifts, confirming its efficacy in enhancing the generalizability of GNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20295v1-abstract-full').style.display = 'none'; document.getElementById('2410.20295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20280">arXiv:2410.20280</a> <span> [<a href="https://arxiv.org/pdf/2410.20280">pdf</a>, <a href="https://arxiv.org/format/2410.20280">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MarDini: Masked Autoregressive Diffusion for Video Generation at Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haozhe Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shikun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zijian Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mengmeng Xu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yanping Xie</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiao Han</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rez%2C+J+C">Juan C. 
Pérez</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Ding Liu</a>, <a href="/search/cs?searchtype=author&query=Kahatapitiya%2C+K">Kumara Kahatapitiya</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+M">Menglin Jia</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jui-Chieh Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Sen He</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+T">Tao Xiang</a>, <a href="/search/cs?searchtype=author&query=Schmidhuber%2C+J">Jürgen Schmidhuber</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rez-R%C3%BAa%2C+J">Juan-Manuel Pérez-Rúa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20280v1-abstract-short" style="display: inline;"> We introduce MarDini, a new family of video diffusion models that integrate the advantages of masked auto-regression (MAR) into a unified diffusion model (DM) framework. Here, MAR handles temporal planning, while DM focuses on spatial generation in an asymmetric network design: i) a MAR-based planning model containing most of the parameters generates planning signals for each masked frame using lo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20280v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20280v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20280v1-abstract-full" style="display: none;"> We introduce MarDini, a new family of video diffusion models that integrate the advantages of masked auto-regression (MAR) into a unified diffusion model (DM) framework. Here, MAR handles temporal planning, while DM focuses on spatial generation in an asymmetric network design: i) a MAR-based planning model containing most of the parameters generates planning signals for each masked frame using low-resolution input; ii) a lightweight generation model uses these signals to produce high-resolution frames via diffusion de-noising. MarDini's MAR enables video generation conditioned on any number of masked frames at any frame position: a single model can handle video interpolation (e.g., masking middle frames), image-to-video generation (e.g., masking from the second frame onward), and video expansion (e.g., masking half the frames). The efficient design allocates most of the computational resources to the low-resolution planning model, making computationally expensive but important spatio-temporal attention feasible at scale. MarDini sets a new state-of-the-art for video interpolation; meanwhile, within a few inference steps, it efficiently generates videos on par with those of much more expensive advanced image-to-video models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20280v1-abstract-full').style.display = 'none'; document.getElementById('2410.20280v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
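<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">The task-by-masking idea above can be made concrete with a toy example. The boolean patterns below illustrate the three masking regimes named in the abstract; the frame count and exact layouts are assumptions.</span> </p> <pre><code># Hedged sketch of frame-masking patterns (True = frame must be generated).
# Illustration only, not MarDini code.
import numpy as np

T = 8  # frames in a clip (assumed)

interpolation = np.ones(T, dtype=bool)
interpolation[[0, -1]] = False   # keep first and last frames, mask the middle

image_to_video = np.ones(T, dtype=bool)
image_to_video[0] = False        # keep only the first frame

expansion = np.zeros(T, dtype=bool)
expansion[T // 2:] = True        # keep the first half, generate the second

for name, mask in [("interpolation", interpolation),
                   ("image-to-video", image_to_video),
                   ("expansion", expansion)]:
    print(f"{name:15s}", mask.astype(int))</code></pre>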
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://mardini-vidgen.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18666">arXiv:2410.18666</a> <span> [<a href="https://arxiv.org/pdf/2410.18666">pdf</a>, <a href="https://arxiv.org/format/2410.18666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DreamClear: High-Capacity Real-World Image Restoration with Privacy-Safe Dataset Curation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ai%2C+Y">Yuang Ai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiaoqiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huaibo Huang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaotian Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengyu Chen</a>, <a href="/search/cs?searchtype=author&query=You%2C+Q">Quanzeng You</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongxia Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18666v2-abstract-short" style="display: inline;"> Image restoration (IR) in real-world scenarios presents significant challenges due to the lack of high-capacity models and comprehensive datasets. To tackle these issues, we present a dual strategy: GenIR, an innovative data curation pipeline, and DreamClear, a cutting-edge Diffusion Transformer (DiT)-based image restoration model. GenIR, our pioneering contribution, is a dual-prompt learning pipe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18666v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18666v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18666v2-abstract-full" style="display: none;"> Image restoration (IR) in real-world scenarios presents significant challenges due to the lack of high-capacity models and comprehensive datasets. To tackle these issues, we present a dual strategy: GenIR, an innovative data curation pipeline, and DreamClear, a cutting-edge Diffusion Transformer (DiT)-based image restoration model. GenIR, our pioneering contribution, is a dual-prompt learning pipeline that overcomes the limitations of existing datasets, which typically comprise only a few thousand images and thus offer limited generalizability for larger models. GenIR streamlines the process into three stages: image-text pair construction, dual-prompt based fine-tuning, and data generation & filtering. This approach circumvents the laborious data crawling process, ensuring copyright compliance and providing a cost-effective, privacy-safe solution for IR dataset construction. The result is a large-scale dataset of one million high-quality images. Our second contribution, DreamClear, is a DiT-based image restoration model. 
It utilizes the generative priors of text-to-image (T2I) diffusion models and the robust perceptual capabilities of multi-modal large language models (MLLMs) to achieve photorealistic restoration. To boost the model's adaptability to diverse real-world degradations, we introduce the Mixture of Adaptive Modulator (MoAM). It employs token-wise degradation priors to dynamically integrate various restoration experts, thereby expanding the range of degradations the model can address. Our exhaustive experiments confirm DreamClear's superior performance, underlining the efficacy of our dual strategy for real-world image restoration. Code and pre-trained models are available at: https://github.com/shallowdream204/DreamClear. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18666v2-abstract-full').style.display = 'none'; document.getElementById('2410.18666v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17131">arXiv:2410.17131</a> <span> [<a href="https://arxiv.org/pdf/2410.17131">pdf</a>, <a href="https://arxiv.org/format/2410.17131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Aligning Large Language Models via Self-Steering Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+H">Hao Xiang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17131v1-abstract-short" style="display: inline;"> Automated alignment develops alignment systems with minimal human intervention. The key to automated alignment lies in providing learnable and accurate preference signals for preference learning without human annotation. 
In this paper, we introduce Self-Steering Optimization ($SSO$), an algorithm that autonomously generates high-quality preference signals based on predefined principles during iter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17131v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17131v1-abstract-full" style="display: none;"> Automated alignment develops alignment systems with minimal human intervention. The key to automated alignment lies in providing learnable and accurate preference signals for preference learning without human annotation. In this paper, we introduce Self-Steering Optimization ($SSO$), an algorithm that autonomously generates high-quality preference signals based on predefined principles during iterative training, eliminating the need for manual annotation. $SSO$ maintains the accuracy of signals by ensuring a consistent gap between chosen and rejected responses while keeping them both on-policy to suit the current policy model's learning capacity. $SSO$ can benefit the online and offline training of the policy model, as well as enhance the training of reward models. We validate the effectiveness of $SSO$ with two foundation models, Qwen2 and Llama3.1, indicating that it provides accurate, on-policy preference signals throughout iterative training. Without any manual annotation or external models, $SSO$ leads to significant performance improvements across six subjective or objective benchmarks. Moreover, the preference data generated by $SSO$ significantly enhances the performance of the reward model on Rewardbench. Our work presents a scalable approach to preference optimization, paving the way for more efficient and effective automated alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17131v1-abstract-full').style.display = 'none'; document.getElementById('2410.17131v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
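<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">A hedged sketch of the on-policy preference-pair idea described above: sample two responses from the current policy under contrasting principles and keep the pair only when the quality gap is consistent. policy.sample and judge are hypothetical stand-ins, not the paper's API.</span> </p> <pre><code># Hedged sketch in the spirit of the abstract; every name here is hypothetical.
def build_preference_pair(policy, judge, prompt, principles, min_gap=0.5):
    chosen = policy.sample(prompt, principle=principles["good"])   # on-policy
    rejected = policy.sample(prompt, principle=principles["bad"])  # on-policy
    gap = judge(prompt, chosen) - judge(prompt, rejected)
    if gap >= min_gap:        # keep only pairs with a consistent quality gap
        return {"prompt": prompt, "chosen": chosen, "rejected": rejected}
    return None               # gap too small or inverted: skip this prompt</code></pre>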
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16719">arXiv:2410.16719</a> <span> [<a href="https://arxiv.org/pdf/2410.16719">pdf</a>, <a href="https://arxiv.org/format/2410.16719">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Progressive Compositionality In Text-to-Image Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+L">Linghao Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+P+P">Paul Pu Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16719v1-abstract-short" style="display: inline;"> Despite the impressive text-to-image (T2I) synthesis capabilities of diffusion models, they often struggle to understand compositional relationships between objects and attributes, especially in complex settings. Existing solutions have tackled these challenges by optimizing the cross-attention mechanism or learning from the caption pairs with minimal semantic changes. However, can we generate hig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16719v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16719v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16719v1-abstract-full" style="display: none;"> Despite the impressive text-to-image (T2I) synthesis capabilities of diffusion models, they often struggle to understand compositional relationships between objects and attributes, especially in complex settings. Existing solutions have tackled these challenges by optimizing the cross-attention mechanism or learning from the caption pairs with minimal semantic changes. However, can we generate high-quality complex contrastive images that diffusion models can directly discriminate based on visual representations? In this work, we leverage large language models (LLMs) to compose realistic, complex scenarios and harness Visual Question Answering (VQA) systems alongside diffusion models to automatically curate a contrastive dataset, ConPair, consisting of 15k pairs of high-quality contrastive images. These pairs feature minimal visual discrepancies and cover a wide range of attribute categories, especially complex and natural scenarios. To learn effectively from these error cases, i.e., hard negative images, we propose EvoGen, a new multi-stage curriculum for contrastive learning of diffusion models. Through extensive experiments across a wide range of compositional scenarios, we showcase the effectiveness of our proposed framework on compositional T2I benchmarks. 
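<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">As a rough illustration of the automatic curation step described above, the filter below keeps a contrastive pair only when a VQA system clearly accepts the positive image and clearly rejects the hard negative. vqa_yes_prob and both thresholds are hypothetical.</span> </p> <pre><code># Hedged sketch of VQA-based pair filtering; vqa_yes_prob is a hypothetical
# helper returning P("yes") for an image/question pair.
def keep_pair(vqa_yes_prob, caption, img_pos, img_neg, hi=0.9, lo=0.1):
    question = f"Does this image show: {caption}?"
    p_pos = vqa_yes_prob(img_pos, question)
    p_neg = vqa_yes_prob(img_neg, question)
    # the positive must clearly match the caption; the negative must not
    return p_pos >= hi and lo >= p_neg</code></pre>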
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16719v1-abstract-full').style.display = 'none'; document.getElementById('2410.16719v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16642">arXiv:2410.16642</a> <span> [<a href="https://arxiv.org/pdf/2410.16642">pdf</a>, <a href="https://arxiv.org/format/2410.16642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fire and Smoke Detection with Burning Intensity Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyi Han</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yanfei Wu</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+N">Nan Pu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Bei%2C+Y">Yijun Bei</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lechao Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16642v1-abstract-short" style="display: inline;"> An effective Fire and Smoke Detection (FSD) and analysis system is of paramount importance due to the destructive potential of fire disasters. However, many existing FSD methods directly employ generic object detection techniques without considering the transparency of fire and smoke, which leads to imprecise localization and reduces detection performance. To address this issue, a new Attentive Fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16642v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16642v1-abstract-full" style="display: none;"> An effective Fire and Smoke Detection (FSD) and analysis system is of paramount importance due to the destructive potential of fire disasters. However, many existing FSD methods directly employ generic object detection techniques without considering the transparency of fire and smoke, which leads to imprecise localization and reduces detection performance. To address this issue, a new Attentive Fire and Smoke Detection Model (a-FSDM) is proposed. This model not only retains the robust feature extraction and fusion capabilities of conventional detection algorithms but also redesigns the detection head specifically for transparent targets in FSD, termed the Attentive Transparency Detection Head (ATDH). In addition, Burning Intensity (BI) is introduced as a pivotal feature for fire-related downstream risk assessments in traditional FSD methodologies. Extensive experiments on multiple FSD datasets showcase the effectiveness and versatility of the proposed FSD model. 
The project is available at \href{https://xiaoyihan6.github.io/FSD/}{https://xiaoyihan6.github.io/FSD/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16642v1-abstract-full').style.display = 'none'; document.getElementById('2410.16642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16631">arXiv:2410.16631</a> <span> [<a href="https://arxiv.org/pdf/2410.16631">pdf</a>, <a href="https://arxiv.org/format/2410.16631">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Multi-Scene Fire and Smoke Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyi Han</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+N">Nan Pu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Bei%2C+Y">Yijun Bei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lechao Cheng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Liang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16631v1-abstract-short" style="display: inline;"> The current irregularities in existing public Fire and Smoke Detection (FSD) datasets have become a bottleneck in the advancement of FSD technology. Upon in-depth analysis, we identify the core issue as the lack of standardized dataset construction, uniform evaluation systems, and clear performance benchmarks. To address this issue and drive innovation in FSD technology, we systematically gather d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16631v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16631v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16631v1-abstract-full" style="display: none;"> The current irregularities in existing public Fire and Smoke Detection (FSD) datasets have become a bottleneck in the advancement of FSD technology. Upon in-depth analysis, we identify the core issue as the lack of standardized dataset construction, uniform evaluation systems, and clear performance benchmarks. To address this issue and drive innovation in FSD technology, we systematically gather diverse resources from public sources to create a more comprehensive and refined FSD benchmark. Additionally, recognizing the inadequate coverage of existing dataset scenes, we strategically expand scenes, relabel, and standardize existing public FSD datasets to ensure accuracy and consistency. We aim to establish a standardized, realistic, unified, and efficient FSD research platform that mirrors real-life scenes closely. 
Through our efforts, we aim to provide robust support for the breakthrough and development of FSD technology. The project is available at \href{https://xiaoyihan6.github.io/FSD/}{https://xiaoyihan6.github.io/FSD/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16631v1-abstract-full').style.display = 'none'; document.getElementById('2410.16631v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15732">arXiv:2410.15732</a> <span> [<a href="https://arxiv.org/pdf/2410.15732">pdf</a>, <a href="https://arxiv.org/format/2410.15732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ViMoE: An Empirical Study of Designing Vision Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xumeng Han</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+L">Longhui Wei</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhiyang Dou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+C">Chenhui Qiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xin He</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yingfei Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qi Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15732v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MoE) models embody the divide-and-conquer concept and are a promising approach for increasing model capacity, demonstrating excellent scalability across multiple domains. In this paper, we integrate the MoE structure into the classic Vision Transformer (ViT), naming it ViMoE, and explore the potential of applying MoE to vision through a comprehensive study on image classificati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15732v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15732v1-abstract-full" style="display: none;"> Mixture-of-Experts (MoE) models embody the divide-and-conquer concept and are a promising approach for increasing model capacity, demonstrating excellent scalability across multiple domains. In this paper, we integrate the MoE structure into the classic Vision Transformer (ViT), naming it ViMoE, and explore the potential of applying MoE to vision through a comprehensive study on image classification. However, we observe that the performance is sensitive to the configuration of MoE layers, making it challenging to obtain optimal results without careful design. 
The underlying cause is that inappropriate MoE layers lead to unreliable routing and hinder experts from effectively acquiring helpful knowledge. To address this, we introduce a shared expert to learn and capture common information, serving as an effective way to construct stable ViMoE. Furthermore, we demonstrate how to analyze expert routing behavior, revealing which MoE layers are capable of specializing in handling specific information and which are not. This provides guidance for retaining the critical layers while removing redundancies, thereby advancing ViMoE to be more efficient without sacrificing accuracy. We aspire for this work to offer new insights into the design of vision MoE models and provide valuable empirical guidance for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15732v1-abstract-full').style.display = 'none'; document.getElementById('2410.15732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15556">arXiv:2410.15556</a> <span> [<a href="https://arxiv.org/pdf/2410.15556">pdf</a>, <a href="https://arxiv.org/format/2410.15556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Gradient Rewiring for Editable Graph Neural Network Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhimeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zirui Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaotian Han</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Q">Qizhang Feng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Hongye Jin</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Q">Qiaoyu Tan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kaixiong Zhou</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+N">Na Zou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xia Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15556v2-abstract-short" style="display: inline;"> Deep neural networks are ubiquitously adopted in many applications, such as computer vision, natural language processing, and graph analytics. However, well-trained neural networks can make prediction errors after deployment as the world changes. \textit{Model editing} involves updating the base model to correct prediction errors with less accessible training data and computational resources. 
Desp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15556v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15556v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15556v2-abstract-full" style="display: none;"> Deep neural networks are ubiquitously adopted in many applications, such as computer vision, natural language processing, and graph analytics. However, well-trained neural networks can make prediction errors after deployment as the world changes. \textit{Model editing} involves updating the base model to correct prediction errors with less accessible training data and computational resources. Despite recent advances in model editors in computer vision and natural language processing, editable training in graph neural networks (GNNs) is rarely explored. The challenge with editable GNN training lies in the inherent information aggregation across neighbors, which can lead model editors to affect the predictions of other nodes unintentionally. In this paper, we first observe that the gradients of the cross-entropy loss for the target node and the training nodes are significantly inconsistent, which indicates that directly fine-tuning the base model using the loss on the target node deteriorates the performance on training nodes. Motivated by the gradient inconsistency observation, we propose a simple yet effective \underline{G}radient \underline{R}ewiring method for \underline{E}ditable graph neural network training, named \textbf{GRE}. Specifically, we first store the anchor gradient of the loss on training nodes to preserve the locality. Subsequently, we rewire the gradient of the loss on the target node to preserve performance on the training nodes using the anchor gradient. Experiments demonstrate the effectiveness of GRE on various model architectures and graph datasets in terms of multiple editing situations. The source code is available at \url{https://github.com/zhimengj0326/Gradient_rewiring_editing} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15556v2-abstract-full').style.display = 'none'; document.getElementById('2410.15556v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
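<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Editor's note:</span> <span class="has-text-grey-dark">One plausible reading of the rewiring step summarized above is a conflict-removing gradient projection (in the spirit of PCGrad-style methods). The exact rule below is an assumption for illustration, not the paper's formula.</span> </p> <pre><code># Hedged sketch: drop the component of the target-node gradient that
# conflicts with the stored anchor gradient of the training nodes.
import numpy as np

def rewire(g_target, g_anchor):
    dot = float(g_target @ g_anchor)
    if dot >= 0:              # no conflict: leave the gradient unchanged
        return g_target
    # project out the conflicting direction so training-node loss is preserved
    return g_target - dot / (g_anchor @ g_anchor + 1e-12) * g_anchor

g_t = np.array([1.0, -2.0])
g_a = np.array([1.0, 1.0])
print(rewire(g_t, g_a))       # -> [ 1.5 -1.5], orthogonal to the anchor</code></pre>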
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13841">arXiv:2410.13841</a> <span> [<a href="https://arxiv.org/pdf/2410.13841">pdf</a>, <a href="https://arxiv.org/format/2410.13841">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Unified View of Delta Parameter Editing in Post-Trained Large-Scale Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Q">Qiaoyu Tang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Le Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13841v1-abstract-short" style="display: inline;"> Post-training has emerged as a crucial paradigm for adapting large-scale pre-trained models to various tasks, whose effects are fully reflected by delta parameters (i.e., the disparity between post-trained and pre-trained parameters). While numerous studies have explored delta parameter properties via operations like pruning, quantization, low-rank approximation, and extrapolation, a unified frame… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13841v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13841v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13841v1-abstract-full" style="display: none;"> Post-training has emerged as a crucial paradigm for adapting large-scale pre-trained models to various tasks, whose effects are fully reflected by delta parameters (i.e., the disparity between post-trained and pre-trained parameters). While numerous studies have explored delta parameter properties via operations like pruning, quantization, low-rank approximation, and extrapolation, a unified framework for systematically examining these characteristics has been lacking. In this paper, we propose a novel perspective based on Riemann sum approximation of the loss function to elucidate delta parameter editing operations. Our analysis categorizes existing methods into three classes based on their post-editing performance: competitive, decreased, and improved, explaining how they are expressed by the Riemann sum approximation term and how they alter the model performance. Extensive experiments on both visual and language models, including ViT, LLaMA 3, Qwen 2, and Mistral, corroborate our theoretical findings. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13264">arXiv:2410.13264</a> <span> [<a href="https://arxiv.org/pdf/2410.13264">pdf</a>, <a href="https://arxiv.org/format/2410.13264">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Latent Road to Atoms: Backmapping Coarse-grained Protein Structures with Latent Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuancheng Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kang Liu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qiwei Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Coarse-grained (CG) molecular dynamics simulations offer computational efficiency for exploring protein conformational ensembles and thermodynamic properties. Though coarse representations enable large-scale simulations across extended temporal and spatial ranges, the sacrifice of atomic-level details limits their utility in tasks such as ligand docking and protein-protein interaction prediction. Backmapping, the process of reconstructing all-atom structures from coarse-grained representations, is crucial for recovering these fine details. While recent machine learning methods have made strides in protein structure generation, challenges persist in reconstructing diverse atomistic conformations that maintain geometric accuracy and chemical validity. In this paper, we present Latent Diffusion Backmapping (LDB), a novel approach leveraging denoising diffusion within latent space to address these challenges. By combining discrete latent encoding with diffusion, LDB bypasses the need for equivariant and internal coordinate manipulation, significantly simplifying the training and sampling processes as well as facilitating better and wider exploration in configuration space. We evaluate LDB's state-of-the-art performance on three distinct protein datasets, demonstrating its ability to efficiently reconstruct structures with high structural accuracy and chemical validity. Moreover, LDB shows exceptional versatility in capturing diverse protein ensembles, highlighting its capability to explore intricate conformational spaces. Our results position LDB as a powerful and scalable approach for backmapping, effectively bridging the gap between CG simulations and atomic-level analyses in computational biology. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11499">arXiv:2410.11499</a> <span> [<a href="https://arxiv.org/pdf/2410.11499">pdf</a>, <a href="https://arxiv.org/format/2410.11499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Genomics">q-bio.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BSM: Small but Powerful Biological Sequence Model for Genes and Proteins </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+W">Weixi Xiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xueting Han</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+X">Xiujuan Chai</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+J">Jing Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Modeling biological sequences such as DNA, RNA, and proteins is crucial for understanding complex processes like gene regulation and protein synthesis. However, most current models either focus on a single type or treat multiple types of data separately, limiting their ability to capture cross-modal relationships. We propose that by learning the relationships between these modalities, the model can enhance its understanding of each type. To address this, we introduce BSM, a small but powerful mixed-modal biological sequence foundation model, trained on three types of data: RefSeq, Gene Related Sequences, and interleaved biological sequences from the web. These datasets capture the genetic flow, gene-protein relationships, and the natural co-occurrence of diverse biological data, respectively. By training on mixed-modal data, BSM significantly enhances learning efficiency and cross-modal representation, outperforming models trained solely on unimodal data. With only 110M parameters, BSM achieves performance comparable to much larger models across both single-modal and mixed-modal tasks, and uniquely demonstrates in-context learning capability for mixed-modal tasks, which is absent in existing models. Further scaling to 270M parameters demonstrates even greater performance gains, highlighting the potential of BSM as a significant advancement in multimodal biological sequence modeling. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11222">arXiv:2410.11222</a> <span> [<a href="https://arxiv.org/pdf/2410.11222">pdf</a>, <a href="https://arxiv.org/format/2410.11222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Quadratic Gating Functions in Mixture of Experts: A Statistical Insight </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Akbarian%2C+P">Pedram Akbarian</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+H">Huy Nguyen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xing Han</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+N">Nhat Ho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Mixture of Experts (MoE) models are highly effective in scaling model capacity while preserving computational efficiency, with the gating network, or router, playing a central role by directing inputs to the appropriate experts. In this paper, we establish a novel connection between MoE frameworks and attention mechanisms, demonstrating how quadratic gating can serve as a more expressive and efficient alternative. Motivated by this insight, we explore the implementation of quadratic gating within MoE models, identifying a connection between the self-attention mechanism and quadratic gating. We conduct a comprehensive theoretical analysis of the quadratic softmax gating MoE framework, showing improved sample efficiency in expert and parameter estimation. Our analysis provides key insights into optimal designs for quadratic gating and expert functions, further elucidating the principles behind widely used attention mechanisms. Through extensive evaluations, we demonstrate that the quadratic gating MoE outperforms the traditional linear gating MoE. Moreover, our theoretical insights have guided the development of a novel attention mechanism, which we validated through extensive experiments. The results demonstrate its favorable performance over conventional models across various tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pedram Akbarian, Huy Nguyen, and Xing Han made equal contributions to this work</span> </p>
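<p class="is-size-7">One plausible way to contrast the two gating families discussed above: linear gating scores an input with one weight vector per expert, while quadratic gating scores it with one quadratic form per expert. The parameterization below is an assumption for illustration, not the paper's construction.</p> <pre><code class="language-python">
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

rng = np.random.default_rng(0)
d, n_experts = 8, 4
x = rng.normal(size=d)

# linear gating: one score vector per expert
W = rng.normal(size=(n_experts, d))
linear_weights = softmax(W @ x)

# quadratic gating: one quadratic form per expert, x^T A_i x + b_i
A = rng.normal(size=(n_experts, d, d))
b = rng.normal(size=n_experts)
quad_weights = softmax(np.array([x @ A[i] @ x + b[i] for i in range(n_experts)]))

print(linear_weights, quad_weights)  # routing distributions over the experts
</code></pre>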
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11222v2-abstract-full').style.display = 'none'; document.getElementById('2410.11222v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pedram Akbarian, Huy Nguyen, and Xing Han made equal contributions to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10594">arXiv:2410.10594</a> <span> [<a href="https://arxiv.org/pdf/2410.10594">pdf</a>, <a href="https://arxiv.org/format/2410.10594">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shi Yu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Chaoyue Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bokai Xu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+J">Junbo Cui</a>, <a href="/search/cs?searchtype=author&query=Ran%2C+J">Junhao Ran</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yukun Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10594v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) is an effective technique that enables large language models (LLMs) to utilize external knowledge sources for generation. However, current RAG systems are solely based on text, rendering it impossible to utilize vision information like layout and images that play crucial roles in real-world multi-modality documents. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09342">arXiv:2410.09342</a> <span> [<a href="https://arxiv.org/pdf/2410.09342">pdf</a>, <a href="https://arxiv.org/format/2410.09342">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM$\times$MapReduce: Simplified Long-Sequence Processing using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zihan Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chong Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+Y">Yu Chao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhili Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=An%2C+R">Rongqiao An</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Q">Qi Shi</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhixing Tan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaodong Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Enlarging the context window of large language models (LLMs) has become a crucial research area, particularly for applications involving extremely long texts. In this work, we propose a novel training-free framework for processing long texts, utilizing a divide-and-conquer strategy to achieve comprehensive document understanding. The proposed LLM$\times$MapReduce framework splits the entire document into several chunks for LLMs to read and then aggregates the intermediate answers to produce the final output. The main challenge for divide-and-conquer long text processing frameworks lies in the risk of losing essential long-range information when splitting the document, which can lead the model to produce incomplete or incorrect answers based on the segmented texts. Disrupted long-range information can be classified into two categories: inter-chunk dependency and inter-chunk conflict. We design a structured information protocol to better cope with inter-chunk dependency and an in-context confidence calibration mechanism to resolve inter-chunk conflicts. Experimental results demonstrate that LLM$\times$MapReduce can outperform representative open-source and commercial long-context LLMs, and is applicable to several different models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress. Code: https://github.com/thunlp/LLMxMapReduce</span> </p>
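<p class="is-size-7">The divide-and-conquer skeleton is straightforward to sketch. Below, <code>llm</code> is a hypothetical prompt-to-completion callable; the paper's structured information protocol and confidence calibration mechanism are not reproduced, only the map and reduce stages.</p> <pre><code class="language-python">
from typing import Callable, List

def llm_map_reduce(document: str, question: str,
                   llm: Callable[[str], str], chunk_size: int = 4000) -> str:
    """Divide-and-conquer long-text QA skeleton."""
    chunks = [document[i:i + chunk_size]
              for i in range(0, len(document), chunk_size)]
    # map: one intermediate answer per chunk
    partials: List[str] = [
        llm("Context:\n" + c + "\n\nQuestion: " + question +
            "\nIntermediate answer:")
        for c in chunks
    ]
    # reduce: aggregate intermediate answers into the final output
    return llm("Intermediate answers:\n" + "\n".join(partials) +
               "\n\nQuestion: " + question + "\nFinal answer:")
</code></pre>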
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08821">arXiv:2410.08821</a> <span> [<a href="https://arxiv.org/pdf/2410.08821">pdf</a>, <a href="https://arxiv.org/format/2410.08821">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Retriever-and-Memory: Towards Adaptive Note-Enhanced Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruobing Wang</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+D">Daren Zha</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shi Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qingfei Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yukun Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhenghao Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Retrieval-Augmented Generation (RAG) mitigates the factual errors and hallucinated outputs that Large Language Models (LLMs) produce in open-domain question-answering (OpenQA) tasks by introducing external knowledge. For complex QA, however, existing RAG methods use LLMs to actively predict retrieval timing and directly use the retrieved information for generation, regardless of whether the retrieval timing accurately reflects the actual information needs or sufficiently considers previously retrieved knowledge; this may result in insufficient information gathering and interaction, yielding low-quality answers. To address these issues, we propose a generic RAG approach called Adaptive Note-Enhanced RAG (Adaptive-Note) for complex QA tasks, which includes an iterative information collector, an adaptive memory reviewer, and a task-oriented generator, while following a new Retriever-and-Memory paradigm. Specifically, Adaptive-Note introduces an overarching view of knowledge growth, iteratively gathering new information in the form of notes and updating it into the existing optimal knowledge structure, enhancing high-quality knowledge interactions. In addition, we employ an adaptive, note-based stop-exploration strategy to decide "what to retrieve and when to stop" so as to encourage sufficient knowledge exploration. We conduct extensive experiments on five complex QA datasets, and the results demonstrate the superiority and effectiveness of our method and its components. The code and data are at https://github.com/thunlp/Adaptive-Note. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 2 figures</span> </p>
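<p class="is-size-7">A toy rendering of the Retriever-and-Memory loop, with hypothetical <code>retrieve</code> and <code>llm</code> callables; the prompts and the stop rule below are placeholders for the paper's actual components.</p> <pre><code class="language-python">
from typing import Callable

def adaptive_note_qa(question: str,
                     retrieve: Callable[[str], str],
                     llm: Callable[[str], str],
                     max_steps: int = 5) -> str:
    """Iterative collect-and-memorize loop with a note-based stop rule."""
    note = ""  # the growing memory of gathered knowledge
    for _ in range(max_steps):
        query = llm("Note so far:\n" + note + "\nQuestion: " + question +
                    "\nNext retrieval query, or STOP:")
        if query.strip() == "STOP":      # adaptive stop-exploration decision
            break
        passages = retrieve(query)
        note = llm("Merge the new passages into the note.\nNote:\n" + note +
                   "\nPassages:\n" + passages + "\nUpdated note:")
    return llm("Note:\n" + note + "\nQuestion: " + question + "\nAnswer:")
</code></pre>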
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08815">arXiv:2410.08815</a> <span> [<a href="https://arxiv.org/pdf/2410.08815">pdf</a>, <a href="https://arxiv.org/format/2410.08815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> StructRAG: Boosting Knowledge Intensive Reasoning of LLMs via Inference-time Hybrid Information Structurization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoqun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanang Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haiyang Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Q">Qiaoyu Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongbin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08815v2-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) is a key means to effectively enhance large language models (LLMs) in many knowledge-based tasks. However, existing RAG methods struggle with knowledge-intensive reasoning tasks, because useful information required to these tasks are badly scattered. This characteristic makes it difficult for existing RAG methods to accurately identify key information and perfo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08815v2-abstract-full').style.display = 'inline'; document.getElementById('2410.08815v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08815v2-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) is a key means to effectively enhance large language models (LLMs) in many knowledge-based tasks. However, existing RAG methods struggle with knowledge-intensive reasoning tasks, because useful information required to these tasks are badly scattered. This characteristic makes it difficult for existing RAG methods to accurately identify key information and perform global reasoning with such noisy augmentation. In this paper, motivated by the cognitive theories that humans convert raw information into various structured knowledge when tackling knowledge-intensive reasoning, we proposes a new framework, StructRAG, which can identify the optimal structure type for the task at hand, reconstruct original documents into this structured format, and infer answers based on the resulting structure. 
Extensive experiments across various knowledge-intensive tasks show that StructRAG achieves state-of-the-art performance, particularly excelling in challenging scenarios, demonstrating its potential as an effective solution for enhancing LLMs in complex real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08815v2-abstract-full').style.display = 'none'; document.getElementById('2410.08815v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07693">arXiv:2410.07693</a> <span> [<a href="https://arxiv.org/pdf/2410.07693">pdf</a>, <a href="https://arxiv.org/format/2410.07693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-Facet Counterfactual Learning for Content Quality Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jiasheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+B">Boxi Cao</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+M">Meng Liao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07693v1-abstract-short" style="display: inline;"> Evaluating the quality of documents is essential for filtering valuable content from the current massive amount of information. Conventional approaches typically rely on a single score as a supervision signal for training content quality evaluators, which is inadequate to differentiate documents with quality variations across multiple facets. In this paper, we propose Multi-facet cOunterfactual LE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07693v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07693v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07693v1-abstract-full" style="display: none;"> Evaluating the quality of documents is essential for filtering valuable content from the current massive amount of information. Conventional approaches typically rely on a single score as a supervision signal for training content quality evaluators, which is inadequate to differentiate documents with quality variations across multiple facets. In this paper, we propose Multi-facet cOunterfactual LEarning (MOLE), a framework for efficiently constructing evaluators that perceive multiple facets of content quality evaluation. 
Given a specific scenario, we prompt large language models to generate counterfactual content that exhibits variations in critical quality facets compared to the original document. Furthermore, we leverage a joint training strategy based on contrastive learning and supervised learning to enable the evaluator to distinguish between different quality facets, resulting in more accurate predictions of content quality scores. Experimental results on 2 datasets across different scenarios demonstrate that our proposed MOLE framework effectively improves the correlation of document content quality evaluations with human judgments, which serve as a valuable toolkit for effective information acquisition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07693v1-abstract-full').style.display = 'none'; document.getElementById('2410.07693v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07145">arXiv:2410.07145</a> <span> [<a href="https://arxiv.org/pdf/2410.07145">pdf</a>, <a href="https://arxiv.org/format/2410.07145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stuffed Mamba: State Collapse and State Capacity of RNN-Based Long-Context Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingfa Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinrong Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengding Hu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07145v1-abstract-short" style="display: inline;"> One essential advantage of recurrent neural networks (RNNs) over transformer-based language models is their linear computational complexity concerning the sequence length, which makes them much faster in handling long sequences during inference. 
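<p class="is-size-7">The joint training strategy above can be illustrated with a toy objective: a supervised regression term on the original document's score plus a hinge-style contrastive term separating it from its counterfactual. The margin and equal weighting below are assumptions, not the paper's hyperparameters.</p> <pre><code class="language-python">
def mole_loss(s_orig: float, s_cf: float, y_orig: float,
              margin: float = 0.5) -> float:
    """Supervised term pulls the original document's predicted score
    toward its label; a hinge contrastive term pushes the counterfactual's
    score at least `margin` below the original's."""
    supervised = (s_orig - y_orig) ** 2
    contrastive = max(0.0, margin - (s_orig - s_cf))
    return supervised + contrastive

print(mole_loss(s_orig=0.8, s_cf=0.7, y_orig=0.9))
</code></pre>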
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07145">arXiv:2410.07145</a> <span> [<a href="https://arxiv.org/pdf/2410.07145">pdf</a>, <a href="https://arxiv.org/format/2410.07145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stuffed Mamba: State Collapse and State Capacity of RNN-Based Long-Context Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingfa Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinrong Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengding Hu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> One essential advantage of recurrent neural networks (RNNs) over transformer-based language models is their linear computational complexity concerning the sequence length, which makes them much faster in handling long sequences during inference. However, most publicly available RNNs (e.g., Mamba and RWKV) are trained on sequences with less than 10K tokens, and their effectiveness in longer contexts remains largely unsatisfactory so far. In this paper, we study the cause of this inability to process long context for RNNs and suggest critical mitigations. We examine two practical concerns when applying state-of-the-art RNNs to long contexts: (1) the inability to extrapolate to inputs longer than the training length and (2) the upper bound of memory capacity. Addressing the first concern, we first investigate *state collapse* (SC), a phenomenon that causes severe performance degradation on sequence lengths not encountered during training. With controlled experiments, we attribute this to overfitting due to the recurrent state being overparameterized for the training length. For the second concern, we train a series of Mamba-2 models on long documents to empirically estimate the recurrent state capacity in language modeling and passkey retrieval. Then, three SC mitigation methods are proposed to improve Mamba-2's length generalizability, allowing the model to process more than 1M tokens without SC. We also find that the recurrent state capacity in passkey retrieval scales exponentially with the state size, and we empirically train a Mamba-2 370M with near-perfect passkey retrieval accuracy on 256K context length. This suggests a promising future for RNN-based long-context modeling. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 18 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06802">arXiv:2410.06802</a> <span> [<a href="https://arxiv.org/pdf/2410.06802">pdf</a>, <a href="https://arxiv.org/format/2410.06802">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Seg2Act: Global Context-aware Action Generation for Document Logical Structuring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zichao Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shaojie He</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+M">Meng Liao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanang Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yanxiong Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Document logical structuring aims to extract the underlying hierarchical structure of documents, which is crucial for document intelligence. Traditional approaches often fall short in handling the complexity and variability of lengthy documents. To address these issues, we introduce Seg2Act, an end-to-end, generation-based method for document logical structuring, revisiting logical structure extraction as an action generation task. Specifically, given the text segments of a document, Seg2Act iteratively generates the action sequence via a global context-aware generative model, and simultaneously updates its global context and current logical structure based on the generated actions. Experiments on the ChCatExt and HierDoc datasets demonstrate the superior performance of Seg2Act in both supervised and transfer learning settings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main Conference</span> </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06802v1-abstract-full').style.display = 'none'; document.getElementById('2410.06802v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05584">arXiv:2410.05584</a> <span> [<a href="https://arxiv.org/pdf/2410.05584">pdf</a>, <a href="https://arxiv.org/format/2410.05584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Reward Model Evaluation: Are We Barking up the Wrong Tree? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+X">Xueru Wen</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+J">Jie Lou</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xing Yu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xinyu Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Ben He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Debing Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05584v2-abstract-short" style="display: inline;"> Reward Models (RMs) are crucial for aligning language models with human preferences. Currently, the evaluation of RMs depends on measuring accuracy against a validation set of manually annotated preference data. Although this method is straightforward and widely adopted, the relationship between RM accuracy and downstream policy performance remains under-explored. In this work, we conduct experime… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05584v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05584v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05584v2-abstract-full" style="display: none;"> Reward Models (RMs) are crucial for aligning language models with human preferences. Currently, the evaluation of RMs depends on measuring accuracy against a validation set of manually annotated preference data. Although this method is straightforward and widely adopted, the relationship between RM accuracy and downstream policy performance remains under-explored. 
In this work, we conduct experiments in a synthetic setting to investigate how differences in RM measured by accuracy translate into gaps in optimized policy performance. Our findings reveal that while there is a weak positive correlation between accuracy and downstream performance, policies optimized towards RMs with similar accuracy can exhibit quite different performance. Moreover, we discover that the way of measuring accuracy significantly impacts its ability to predict the final policy performance. Through the lens of Regressional Goodhart's effect, we identify the existence of exogenous variables impacting the relationship between RM quality measured by accuracy and policy model capability. This underscores the inadequacy of relying solely on accuracy to reflect their impact on policy optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05584v2-abstract-full').style.display = 'none'; document.getElementById('2410.05584v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03777">arXiv:2410.03777</a> <span> [<a href="https://arxiv.org/pdf/2410.03777">pdf</a>, <a href="https://arxiv.org/format/2410.03777">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Determine-Then-Ensemble: Necessity of Top-k Union for Large Language Model Ensembling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuxuan Yao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Han Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingyang Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Sichun Luo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiongwei Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linqi Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03777v1-abstract-short" style="display: inline;"> Large language models (LLMs) exhibit varying strengths and weaknesses across different tasks, prompting recent studies to explore the benefits of ensembling models to leverage their complementary advantages. However, existing LLM ensembling methods often overlook model compatibility and struggle with inefficient alignment of probabilities across the entire vocabulary. 
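<p class="is-size-7">The accuracy-versus-policy-performance question can be probed in a few lines with a synthetic best-of-n experiment: the reward model is modeled as true quality plus Gaussian noise, and policy optimization is approximated by best-of-n selection. This toy setting is an assumption for illustration, not the paper's experimental protocol.</p> <pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)

def policy_quality(rm_noise: float, n: int = 16, trials: int = 2000) -> float:
    """True response quality is standard normal; the reward model sees
    quality plus Gaussian noise; the optimized policy is approximated by
    best-of-n selection under the noisy reward model."""
    quality = rng.normal(size=(trials, n))
    rm_score = quality + rm_noise * rng.normal(size=(trials, n))
    chosen = quality[np.arange(trials), rm_score.argmax(axis=1)]
    return float(chosen.mean())

for noise in (0.0, 0.5, 1.0, 2.0):   # lower noise roughly means higher RM accuracy
    print(noise, round(policy_quality(noise), 3))
</code></pre>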
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03777">arXiv:2410.03777</a> <span> [<a href="https://arxiv.org/pdf/2410.03777">pdf</a>, <a href="https://arxiv.org/format/2410.03777">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Determine-Then-Ensemble: Necessity of Top-k Union for Large Language Model Ensembling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuxuan Yao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Han Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingyang Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Sichun Luo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiongwei Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linqi Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large language models (LLMs) exhibit varying strengths and weaknesses across different tasks, prompting recent studies to explore the benefits of ensembling models to leverage their complementary advantages. However, existing LLM ensembling methods often overlook model compatibility and struggle with inefficient alignment of probabilities across the entire vocabulary. In this study, we empirically investigate the factors influencing ensemble performance, identifying model performance, vocabulary size, and response style as key determinants, revealing that compatibility among models is essential for effective ensembling. This analysis leads to the development of a simple yet effective model selection strategy that identifies compatible models. Additionally, we introduce the \textsc{Uni}on \textsc{T}op-$k$ \textsc{E}nsembling (\textsc{UniTE}), a novel approach that efficiently combines models by focusing on the union of the top-k tokens from each model, thereby avoiding the need for full vocabulary alignment and reducing computational overhead. Extensive evaluations across multiple benchmarks demonstrate that \textsc{UniTE} significantly enhances performance compared to existing methods, offering a more efficient framework for LLM ensembling. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
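<p class="is-size-7">A minimal sketch of union-top-k ensembling for one decoding step, assuming (unlike the general setting the paper addresses) that all models share one tokenizer; the renormalize-then-average rule is a simplification for illustration.</p> <pre><code class="language-python">
import numpy as np

def unite_step(dists, k: int = 10) -> int:
    """One ensembled decoding step: union each model's top-k token ids,
    renormalize every model's probabilities on that union, average, and
    return the arg-max token id."""
    union = set()
    for p in dists:
        union.update(np.argsort(p)[::-1][:k].tolist())
    ids = np.array(sorted(union))
    stacked = np.stack([p[ids] / p[ids].sum() for p in dists])
    return int(ids[stacked.mean(axis=0).argmax()])

rng = np.random.default_rng(0)
dists = [rng.dirichlet(np.ones(100)) for _ in range(3)]  # 3 models, vocab 100
print(unite_step(dists))
</code></pre>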
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03675">arXiv:2410.03675</a> <span> [<a href="https://arxiv.org/pdf/2410.03675">pdf</a>, <a href="https://arxiv.org/format/2410.03675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Controllable Shape Modeling with Neural Generalized Cylinder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiangyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhiqin Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Ruizhen Hu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoguang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Neural shape representations, such as the neural signed distance field (NSDF), have become increasingly popular in shape modeling owing to their ability to handle complex topology and arbitrary resolution. Because such representations use features implicitly, manipulating the represented shapes is inherently inconvenient, since the features cannot be edited intuitively. In this work, we propose the neural generalized cylinder (NGC) for explicit manipulation of NSDFs, an extension of the traditional generalized cylinder (GC). Specifically, we first define a central curve and assign neural features along the curve to represent the profiles. The NSDF is then defined on the relative coordinates of a specialized GC with oval-shaped profiles. By using the relative coordinates, the NSDF can be explicitly controlled via manipulation of the GC. We apply NGC to many non-rigid deformation tasks, such as complex curved deformation, local scaling, and twisting of shapes. Comparisons with other shape-deformation methods demonstrate the effectiveness and efficiency of NGC. Furthermore, NGC can exploit the neural features for shape blending by simple neural feature interpolation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Siggraph Asia 2024 (Conference track)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03440">arXiv:2410.03440</a> <span> [<a href="https://arxiv.org/pdf/2410.03440">pdf</a>, <a href="https://arxiv.org/format/2410.03440">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Benefit of Activation Sparsity in Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Q">Qiujieli Qin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Pre-trained Transformers inherently possess the characteristic of sparse activation, where only a small fraction of the neurons are activated for each token. While sparse activation has been explored through post-training methods, its potential in pre-training remains untapped. In this work, we first study how activation properties change during pre-training. Our examination reveals that Transformers exhibit sparse activation throughout the majority of the pre-training process while the activation correlation keeps evolving as training progresses. Leveraging this observation, we propose Switchable Sparse-Dense Learning (SSD). SSD adaptively switches between Mixture-of-Experts (MoE) based sparse training and conventional dense training during the pre-training process, leveraging the efficiency of sparse training while avoiding its static activation correlation. Compared to dense training, SSD achieves comparable performance with identical model size and reduces pre-training costs. Moreover, the models trained with SSD can be directly used as MoE models for sparse inference and achieve the same performance as dense models with up to $2\times$ faster inference speed. Codes are available at https://github.com/thunlp/moefication. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p>
arXiv:2410.03439 (https://arxiv.org/abs/2410.03439) [pdf, other] cs.CL (Computation and Language)
Title: ToolGen: Unified Tool Retrieval and Calling via Generation
Authors: Renxi Wang, Xudong Han, Lei Ji, Shu Wang, Timothy Baldwin, Haonan Li
Abstract: As large language models (LLMs) advance, their inability to autonomously execute tasks by directly interacting with external tools remains a critical limitation. Traditional methods rely on inputting tool descriptions as context, which is constrained by context length and requires separate, often inefficient, retrieval mechanisms. We introduce ToolGen, a paradigm shift that integrates tool knowledge directly into the LLM's parameters by representing each tool as a unique token. This enables the LLM to generate tool calls and arguments as part of its next-token prediction capabilities, seamlessly blending tool invocation with language generation. Our framework allows the LLM to access and utilize a vast number of tools with no additional retrieval step, significantly enhancing both performance and scalability. Experimental results with over 47,000 tools show that ToolGen not only achieves superior results in both tool retrieval and autonomous task completion but also sets the stage for a new era of AI agents that can adapt to tools across diverse domains. By fundamentally transforming tool retrieval into a generative process, ToolGen paves the way for more versatile, efficient, and autonomous AI systems, enabling end-to-end tool learning and opening opportunities for integration with other advanced techniques such as chain-of-thought and reinforcement learning.
Submitted 8 October, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
ACM Class: I.2.7
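A toy illustration of the "tool as a token" idea, with a stub in place of the language model (all names here are hypothetical, not ToolGen's code): once each tool has its own vocabulary entry, invoking it is ordinary next-token prediction, and the tool's result can be fed back into the context.

```python
# Illustrative toy: tools registered as vocabulary tokens, so a decode loop
# can dispatch on the generated token. The mini "LM" is a stub.
TOOLS = {
    "<tool:weather>": lambda city: f"22C and sunny in {city}",
    "<tool:calc>": lambda expr: str(eval(expr)),  # demo only; never eval untrusted input
}

def fake_lm_step(context):
    """Stand-in for next-token prediction over text tokens plus tool tokens."""
    if "weather" in context and "<tool:weather>" not in context:
        return "<tool:weather>"
    return "done"

def generate(prompt, max_steps=8):
    context = prompt
    for _ in range(max_steps):
        tok = fake_lm_step(context)
        if tok in TOOLS:
            # the model would next generate the call arguments; stubbed here
            result = TOOLS[tok]("Paris")
            context += f" {tok} -> {result}"   # feed the tool result back in
        else:
            context += f" {tok}"
            break
    return context

print(generate("What is the weather in Paris?"))
```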
arXiv:2410.03159 (https://arxiv.org/abs/2410.03159) [pdf, other] cs.LG (Machine Learning); cs.AI (Artificial Intelligence); stat.ML (Machine Learning)
Title: Autoregressive Moving-average Attention Mechanism for Time Series Forecasting
Authors: Jiecheng Lu, Xu Han, Yan Sun, Shihao Yang
Abstract: We propose an Autoregressive (AR) Moving-average (MA) attention structure that can adapt to various linear attention mechanisms, enhancing their ability to capture long-range and local temporal patterns in time series.
In this paper, we first demonstrate that, for the time series forecasting (TSF) task, the previously overlooked decoder-only autoregressive Transformer model can achieve results comparable to the best baselines when appropriate tokenization and training methods are applied. Moreover, inspired by the ARMA model from statistics and recent advances in linear attention, we introduce the full ARMA structure into existing autoregressive attention mechanisms. By using an indirect MA weight generation method, we incorporate the MA term while maintaining the time complexity and parameter size of the underlying efficient attention models. We further explore how indirect parameter generation can produce implicit MA weights that align with the modeling requirements for local temporal impacts. Experimental results show that incorporating the ARMA structure consistently improves the performance of various AR attentions on TSF tasks, achieving state-of-the-art results.
Submitted 4 October, 2024; originally announced October 2024.
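As a reminder of the statistical model that inspires the attention design (this is the textbook ARMA recursion, not the paper's attention mechanism): a one-step ARMA(p, q) forecast combines autoregressive terms on past values with moving-average terms on past residuals.

```python
# Classical ARMA(p, q) one-step-ahead forecasting with given coefficients.
import numpy as np

def arma_forecast(y, phi, theta):
    """One-step forecasts for ARMA(len(phi), len(theta))."""
    p, q = len(phi), len(theta)
    eps = np.zeros_like(y)          # innovation (residual) estimates
    yhat = np.zeros_like(y)
    for t in range(len(y)):
        ar = sum(phi[i] * y[t - 1 - i] for i in range(p) if t - 1 - i >= 0)
        ma = sum(theta[j] * eps[t - 1 - j] for j in range(q) if t - 1 - j >= 0)
        yhat[t] = ar + ma
        eps[t] = y[t] - yhat[t]     # residual feeds future MA terms
    return yhat

rng = np.random.default_rng(0)
y = np.cumsum(rng.normal(size=200)) * 0.1
print(arma_forecast(y, phi=[0.9], theta=[0.3])[-5:])
```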
arXiv:2410.02935 (https://arxiv.org/abs/2410.02935) [pdf, other] stat.ML (Machine Learning); cs.LG (Machine Learning)
Title: On Expert Estimation in Hierarchical Mixture of Experts: Beyond Softmax Gating Functions
Authors: Huy Nguyen, Xing Han, Carl William Harris, Suchi Saria, Nhat Ho
Abstract: With the growing prominence of the Mixture of Experts (MoE) architecture in developing large-scale foundation models, we investigate the Hierarchical Mixture of Experts (HMoE), a specialized variant of MoE that excels in handling complex inputs and improving performance on targeted tasks. Our investigation highlights the advantages of using varied gating functions, moving beyond softmax gating within HMoE frameworks. We theoretically demonstrate that applying tailored gating functions to each expert group allows HMoE to achieve robust results, even when optimal gating functions are applied only at select hierarchical levels. Empirical validation across diverse scenarios supports these theoretical claims, including large-scale multimodal tasks, image classification, and latent domain discovery and prediction tasks, where our modified HMoE models show notable performance improvements.
Submitted 3 October, 2024; originally announced October 2024.
Comments: 58 pages
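A toy two-level mixture sketch of the "varied gating per level" idea, assuming a softmax gate over expert groups and a Laplace-kernel gate within each group (one arbitrary example of a non-softmax gate; the weights and experts here are random stand-ins, not the paper's construction).

```python
# Hierarchical MoE toy: softmax gating across groups, Laplace gating inside.
import numpy as np

rng = np.random.default_rng(1)
d, groups, per_group = 8, 3, 4
W_group = rng.normal(size=(groups, d))             # outer gating weights
centers = rng.normal(size=(groups, per_group, d))  # inner gate "locations"
experts = rng.normal(size=(groups, per_group, d))  # linear experts (toy)

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

def hmoe(x):
    g_outer = softmax(W_group @ x)                 # softmax gate over groups
    out = 0.0
    for k in range(groups):
        dist = np.linalg.norm(centers[k] - x, axis=1)
        g_inner = np.exp(-dist); g_inner /= g_inner.sum()  # Laplace gating
        out += g_outer[k] * np.sum(g_inner * (experts[k] @ x))
    return out

print(hmoe(rng.normal(size=d)))
```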
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">58 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01805">arXiv:2410.01805</a> <span> [<a href="https://arxiv.org/pdf/2410.01805">pdf</a>, <a href="https://arxiv.org/format/2410.01805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Locret: Enhancing Eviction in Long-Context LLM Inference with Trained Retaining Heads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuxiang Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+B">Binhang Yuan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01805v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown remarkable advances in supporting long-context comprehension and processing tasks. However, scaling the generation inference of LLMs to such long contexts incurs significant additional computation load, and demands a substantial GPU memory footprint to maintain the key-value (KV) cache of transformer-based LLMs. Existing KV cache compression methods, such as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01805v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01805v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01805v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown remarkable advances in supporting long-context comprehension and processing tasks. However, scaling the generation inference of LLMs to such long contexts incurs significant additional computation load, and demands a substantial GPU memory footprint to maintain the key-value (KV) cache of transformer-based LLMs. Existing KV cache compression methods, such as quantization, face memory bottlenecks as context length increases, while static-sized caches, such as eviction, suffer from inefficient policies. These limitations restrict deployment on consumer-grade devices like a single Nvidia 4090 GPU. To overcome this, we propose Locret, a framework for long-context LLM inference that introduces retaining heads to evaluate the causal importance of KV cache units, allowing for more accurate eviction within a fixed cache size. Locret is fine-tuned on top of the frozen backbone LLM using a minimal amount of data from standard long-context SFT datasets. During inference, we evict low-importance cache units along with a chunked prefill pattern, significantly reducing peak GPU memory usage. 
We conduct an extensive empirical study to evaluate Locret; the experimental results show that Locret outperforms recent competitive approaches, including InfLLM, quantization, SirLLM, and MInference, in terms of memory efficiency and the quality of generated content: Locret achieves over 20x and 8x KV cache compression ratios relative to the full KV cache for Phi-3-mini-128K and Llama-3.1-8B-instruct, respectively. Additionally, Locret can be combined with other methods, such as quantization and token merging. To our knowledge, Locret is the first framework capable of deploying Llama-3.1-8B or similar models on a single Nvidia 4090 GPU, enabling 128K long-context inference without compromising generation quality and requiring little additional system optimization.
Submitted 2 October, 2024; originally announced October 2024.
Comments: Preprints
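A sketch of importance-scored KV eviction under a fixed budget with chunked prefill, under the assumption that a scoring function stands in for the trained retaining heads (the proxy score below is hypothetical; Locret's scores are learned).

```python
# Toy chunked prefill with budgeted eviction of low-importance KV cache units.
import numpy as np

def retaining_score(keys):
    """Stand-in for a trained retaining head that scores causal importance."""
    return np.linalg.norm(keys, axis=-1)  # hypothetical proxy score

def chunked_prefill_with_eviction(tokens_kv, chunk=4, budget=6):
    cache_k, cache_v = [], []
    for i in range(0, len(tokens_kv), chunk):
        for k, v in tokens_kv[i:i + chunk]:      # prefill one chunk
            cache_k.append(k); cache_v.append(v)
        if len(cache_k) > budget:                # evict low-importance units
            scores = retaining_score(np.array(cache_k))
            keep = np.argsort(scores)[-budget:]
            keep.sort()                          # preserve positional order
            cache_k = [cache_k[j] for j in keep]
            cache_v = [cache_v[j] for j in keep]
    return cache_k, cache_v

kv = [(np.random.randn(8), np.random.randn(8)) for _ in range(20)]
k, v = chunked_prefill_with_eviction(kv)
print(len(k), "cache units retained")
```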
arXiv:2410.01794 (https://arxiv.org/abs/2410.01794) [pdf, other] cs.CL (Computation and Language)
Title: Loki: An Open-Source Tool for Fact Verification
Authors: Haonan Li, Xudong Han, Hao Wang, Yuxia Wang, Minghan Wang, Rui Xing, Yilin Geng, Zenan Zhai, Preslav Nakov, Timothy Baldwin
Abstract: We introduce Loki, an open-source tool designed to address the growing problem of misinformation. Loki adopts a human-centered approach, striking a balance between the quality of fact-checking and the cost of human involvement. It decomposes the fact-checking task into a five-step pipeline: breaking down long texts into individual claims, assessing their check-worthiness, generating queries, retrieving evidence, and verifying the claims. Instead of fully automating the claim verification process, Loki provides essential information at each step to assist human judgment, especially for general users such as journalists and content moderators. Moreover, it has been optimized for latency, robustness, and cost efficiency at a commercially usable level. Loki is released under an MIT license and is available on GitHub. We also provide a video presenting the system and its capabilities.
Submitted 2 October, 2024; originally announced October 2024.
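The five-step pipeline as a skeleton with placeholder functions; these names and heuristics are hypothetical illustrations, not Loki's actual API (see its GitHub release for the real interfaces).

```python
# Hypothetical skeleton of a decompose -> filter -> query -> retrieve -> verify pipeline.
def decompose(text):            # 1. break long text into atomic claims
    return [s.strip() for s in text.split(".") if s.strip()]

def check_worthy(claim):        # 2. filter claims worth verifying (toy heuristic)
    return any(ch.isdigit() for ch in claim)

def make_queries(claim):        # 3. generate search queries
    return [claim]

def retrieve(query):            # 4. fetch evidence (stubbed)
    return [f"evidence for: {query}"]

def verify(claim, evidence):    # 5. surface a judgment for a human to review
    return {"claim": claim, "evidence": evidence, "verdict": "needs review"}

text = "The tower is 330 m tall. It is in Paris."
for claim in decompose(text):
    if check_worthy(claim):
        ev = [e for q in make_queries(claim) for e in retrieve(q)]
        print(verify(claim, ev))
```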
arXiv:2410.00773 (https://arxiv.org/abs/2410.00773) [pdf, other] cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: BabelBench: An Omni Benchmark for Code-Driven Analysis of Multimodal and Multistructured Data
Authors: Xuwu Wang, Qiwen Cui, Yunzhe Tao, Yiran Wang, Ziwei Chai, Xiaotian Han, Boyi Liu, Jianbo Yuan, Jing Su, Guoyin Wang, Tingkai Liu, Liyu Chen, Tianyi Liu, Tao Sun, Yufeng Zhang, Sirui Zheng, Quanzeng You, Yang Yang, Hongxia Yang
Abstract: Large language models (LLMs) have become increasingly pivotal across various domains, especially in handling complex data types. This includes structured data processing, as exemplified by ChartQA and ChatGPT-Ada, and multimodal unstructured data processing as seen in Visual Question Answering (VQA). These areas have attracted significant attention from both industry and academia.
Despite this, there remains a lack of unified evaluation methodologies for these diverse data-handling scenarios. In response, we introduce BabelBench, an innovative benchmark framework that evaluates the proficiency of LLMs in managing multimodal, multistructured data with code execution. BabelBench incorporates a dataset comprising 247 meticulously curated problems that challenge the models with tasks in perception, commonsense reasoning, logical reasoning, and so on. Beyond the basic capabilities of multimodal understanding, structured data processing, and code generation, these tasks demand advanced capabilities in exploration, planning, reasoning, and debugging. Our experimental findings on BabelBench indicate that even cutting-edge models like ChatGPT 4 exhibit substantial room for improvement. The insights derived from our comprehensive analysis offer valuable guidance for future research within the community. The benchmark data can be found at https://github.com/FFD8FFE/babelbench.
Submitted 1 October, 2024; originally announced October 2024.
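A hypothetical sketch of how a code-driven benchmark entry might be scored with code execution; the field names are invented for illustration, and the real schema lives in the BabelBench repository linked above.

```python
# Hypothetical scoring of one benchmark entry: run the model-generated program
# in a subprocess and compare its output to the reference answer.
import subprocess, sys, tempfile, json

problem = {
    "question": "How many rows does the attached table have?",
    "program": "print(3)",      # model-generated code would go here
    "answer": "3",
}

def run_with_code_execution(entry, timeout=5):
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(entry["program"]); path = f.name
    out = subprocess.run([sys.executable, path], capture_output=True,
                         text=True, timeout=timeout)
    return out.stdout.strip() == entry["answer"]

print(json.dumps({"correct": run_with_code_execution(problem)}))
```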
arXiv:2409.19370 (https://arxiv.org/abs/2409.19370) [pdf, ps, other] eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: MambaEviScrib: Mamba and Evidence-Guided Consistency Enhance CNN Robustness for Scribble-Based Weakly Supervised Ultrasound Image Segmentation
Authors: Xiaoxiang Han, Xinyu Li, Jiang Shang, Yiman Liu, Keyan Chen, Shugong Xu, Qiaohong Liu, Qi Zhang
Abstract: Segmenting anatomical structures and lesions from ultrasound images contributes to disease assessment. Weakly supervised learning (WSL) based on sparse annotation has achieved encouraging performance and demonstrated the potential to reduce annotation costs. This study attempts to introduce scribble-based WSL into ultrasound image segmentation tasks. However, ultrasound images often suffer from poor contrast and unclear edges, coupled with insufficient supervision signals for edges, posing challenges for edge prediction. Uncertainty modeling has been proven to help models deal with these issues. Nevertheless, existing uncertainty estimation paradigms are not robust enough and often filter out predictions near decision boundaries, resulting in unstable edge predictions. Therefore, we propose leveraging predictions near decision boundaries effectively. Specifically, we introduce the Dempster-Shafer Theory (DST) of evidence to design an Evidence-Guided Consistency (EGC) strategy. This strategy utilizes high-evidence predictions, which are more likely to occur near high-density regions, to guide the optimization of low-evidence predictions that may appear near decision boundaries. Furthermore, the diverse sizes and locations of lesions in ultrasound images pose a challenge for CNNs with local receptive fields, which struggle to model global information. Therefore, we introduce Visual Mamba, based on structured state-space sequence models, which achieves long-range dependency with linear computational complexity, and we construct a novel hybrid CNN-Mamba framework. During training, the CNN branch and the Mamba branch draw inspiration from each other based on the EGC strategy. Experiments demonstrate the competitiveness of the proposed method. The dataset and code will be available at https://github.com/GtLinyer/MambaEviScrib.
Submitted 31 October, 2024; v1 submitted 28 September, 2024; originally announced September 2024.
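A toy of evidential uncertainty in the Dempster-Shafer / subjective-logic style that evidence-guided methods build on (a generic sketch, not the paper's implementation): non-negative evidence yields per-class belief masses plus an uncertainty mass, which can decide which branch's prediction guides the other.

```python
# Evidential classification toy: evidence e = softplus(logits),
# Dirichlet alpha = e + 1, belief b_k = e_k / S, uncertainty u = K / S.
import numpy as np

def evidential(logits):
    e = np.log1p(np.exp(logits))      # softplus -> non-negative evidence
    alpha = e + 1.0                   # Dirichlet parameters
    S = alpha.sum()
    belief = e / S                    # per-class belief mass
    u = len(logits) / S               # uncertainty mass (beliefs + u sum to 1)
    return belief, u

cnn_logits, mamba_logits = np.array([2.5, 0.1]), np.array([0.3, 0.2])
(b1, u1), (b2, u2) = evidential(cnn_logits), evidential(mamba_logits)
# the higher-evidence (lower-uncertainty) prediction guides the other branch
guide = b1 if u1 < u2 else b2
print("uncertainties:", round(u1, 3), round(u2, 3))
print("consistency target for the low-evidence branch:", guide)
```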
arXiv:2409.16863 (https://arxiv.org/abs/2409.16863) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Towards Unified 3D Hair Reconstruction from Single-View Portraits
Authors: Yujian Zheng, Yuda Qiu, Leyang Jin, Chongyang Ma, Haibin Huang, Di Zhang, Pengfei Wan, Xiaoguang Han
Abstract: Single-view 3D hair reconstruction is challenging due to the wide range of shape variations among diverse hairstyles. Current state-of-the-art methods are specialized in recovering un-braided 3D hair and often treat braided styles as failure cases, because of the inherent difficulty of defining priors for complex hairstyles, whether rule-based or data-based. We propose a novel strategy to enable single-view 3D reconstruction for a variety of hair types via a unified pipeline. To achieve this, we first collect SynMvHair, a large-scale synthetic multi-view hair dataset with diverse 3D hair in both braided and un-braided styles, and learn two hair-specialized diffusion priors. We then optimize 3D Gaussian-based hair from the priors with two specially designed modules, view-wise and pixel-wise Gaussian refinement. Our experiments demonstrate that reconstructing braided and un-braided 3D hair from single-view images via a unified approach is possible, and our method achieves state-of-the-art performance in recovering complex hairstyles. It is worth mentioning that our method generalizes well to real images, although it learns hair priors from synthetic data.
Submitted 25 September, 2024; originally announced September 2024.
Comments: SIGGRAPH Asia 2024, project page: https://unihair24.github.io
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16863v1-abstract-full').style.display = 'none'; document.getElementById('2409.16863v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SIGGRAPH Asia 2024, project page: https://unihair24.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16827">arXiv:2409.16827</a> <span> [<a href="https://arxiv.org/pdf/2409.16827">pdf</a>, <a href="https://arxiv.org/format/2409.16827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Focus Entirety and Perceive Environment for Arbitrary-Shaped Text Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16827v1-abstract-short" style="display: inline;"> Due to the diversity of scene text in aspects such as font, color, shape, and size, accurately and efficiently detecting text is still a formidable challenge. Among the various detection approaches, segmentation-based approaches have emerged as prominent contenders owing to their flexible pixel-level predictions. However, these methods typically model text instances in a bottom-up manner, which is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16827v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16827v1-abstract-full" style="display: none;"> Due to the diversity of scene text in aspects such as font, color, shape, and size, accurately and efficiently detecting text is still a formidable challenge. Among the various detection approaches, segmentation-based approaches have emerged as prominent contenders owing to their flexible pixel-level predictions. However, these methods typically model text instances in a bottom-up manner, which is highly susceptible to noise. In addition, the prediction of pixels is isolated without introducing pixel-feature interaction, which also influences the detection performance. To alleviate these problems, we propose a multi-information level arbitrary-shaped text detector consisting of a focus entirety module (FEM) and a perceive environment module (PEM). The former extracts instance-level features and adopts a top-down scheme to model texts to reduce the influence of noises. 
Specifically, it assigns consistent entirety information to pixels within the same instance to improve their cohesion, and it emphasizes scale information, enabling the model to distinguish texts of varying scales. The latter extracts region-level information and encourages the model to focus on the distribution of positive samples in the vicinity of a pixel, thereby perceiving environmental information. It treats kernel pixels as positive samples and helps the model differentiate text and kernel features. Extensive experiments demonstrate the FEM's ability to efficiently support the model in handling texts of different scales and confirm that the PEM helps perceive pixels more accurately by focusing on pixel vicinities. Comparisons show that the proposed model outperforms existing state-of-the-art approaches on four public datasets.
Submitted 25 September, 2024; originally announced September 2024.
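A toy of the top-down "entirety information" idea, under the assumption that instance masks are available: pixels of the same text instance receive one shared instance-level feature (here simply the mean), in contrast to purely bottom-up per-pixel prediction. This is an illustration of the concept, not the FEM's architecture.

```python
# Broadcast a shared instance-level feature to all pixels of each instance.
import numpy as np

feat = np.random.randn(4, 4, 8)               # H x W x C pixel features
inst = np.array([[0, 0, 1, 1],
                 [0, 0, 1, 1],
                 [2, 2, 2, 0],
                 [2, 2, 2, 0]])               # instance id per pixel

entirety = np.zeros_like(feat)
for i in np.unique(inst):
    mask = inst == i
    entirety[mask] = feat[mask].mean(axis=0)  # consistent info per instance

print(entirety[0, 0, :3], entirety[1, 1, :3])  # same instance -> same feature
```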
arXiv:2409.16820 (https://arxiv.org/abs/2409.16820) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Spotlight Text Detector: Spotlight on Candidate Regions Like a Camera
Authors: Xu Han, Junyu Gao, Chuang Yang, Yuan Yuan, Qi Wang
Abstract: The irregular contour representation is one of the tough challenges in scene text detection. Although segmentation-based methods have achieved significant progress with the help of flexible pixel-level prediction, the overlap of geographically close texts hinders detecting them separately. To alleviate this problem, some shrink-based methods predict text kernels and expand them to restructure texts. However, the text kernel is an artificial object with incomplete semantic features, prone to incorrect or missed detections. In addition, unlike general objects, the geometric features (aspect ratio, scale, and shape) of scene texts vary significantly, which makes them difficult to detect accurately. To address these problems, we propose an effective spotlight text detector (STD), which consists of a spotlight calibration module (SCM) and a multivariate information extraction module (MIEM). The former concentrates efforts on the candidate kernel, like a camera focusing on a target: it obtains candidate features through a mapping filter and calibrates them precisely to eliminate false-positive samples. The latter designs different shape schemes to explore multiple geometric features of scene texts, helping to extract various spatial relationships and improving the model's ability to recognize kernel regions. Ablation studies prove the effectiveness of the designed SCM and MIEM. Extensive experiments verify that STD is superior to existing state-of-the-art methods on various datasets, including ICDAR2015, CTW1500, MSRA-TD500, and Total-Text.
Submitted 25 September, 2024; originally announced September 2024.
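For background, a sketch of the shrink-based scheme that STD builds on (the baseline idea only, not STD's calibration modules): a predicted text kernel is labeled into instances and then expanded within the text mask to recover the full instances.

```python
# Shrink-based baseline: label shrunk kernels, then grow them inside the mask.
import numpy as np
from scipy import ndimage

text_mask = np.zeros((8, 8), bool)
text_mask[1:4, 1:7] = True                   # two separate text regions
text_mask[5:8, 2:6] = True
kernel = ndimage.binary_erosion(text_mask)   # stand-in for a predicted kernel map

labels, n = ndimage.label(kernel)            # one label per kernel instance
grown = labels.copy()
while True:                                  # expand kernels inside the mask
    dil = ndimage.grey_dilation(grown, size=3)
    new = np.where((grown == 0) & text_mask, dil, grown)
    if (new == grown).all():
        break
    grown = new
print(n, "instances\n", grown)
```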