Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 156 results for author: <span class="mathjax">Ye, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Ye%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ye, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ye%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ye, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Ye%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13130">arXiv:2502.13130</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13130">pdf</a>, <a href="https://arxiv.org/format/2502.13130">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Magma: A Foundation Model for Multimodal AI Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jianwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+R">Reuben Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qianhui Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+R">Ruijie Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+B">Baolin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yongyuan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yu Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+M">Mu Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Seonghyeon Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Jang%2C+J">Joel Jang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Y">Yuquan Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Liden%2C+L">Lars Liden</a>, <a 
href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13130v1-abstract-short" style="display: inline;"> We present Magma, a foundation model that serves multimodal AI agentic tasks in both the digital and physical worlds. Magma is a significant extension of vision-language (VL) models in that it not only retains the VL understanding ability (verbal intelligence) of the latter, but is also equipped with the ability to plan and act in the visual-spatial world (spatial-temporal intelligence) and comple&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13130v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13130v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13130v1-abstract-full" style="display: none;"> We present Magma, a foundation model that serves multimodal AI agentic tasks in both the digital and physical worlds. Magma is a significant extension of vision-language (VL) models in that it not only retains the VL understanding ability (verbal intelligence) of the latter, but is also equipped with the ability to plan and act in the visual-spatial world (spatial-temporal intelligence) and complete agentic tasks ranging from UI navigation to robot manipulation. To endow the agentic capabilities, Magma is pretrained on large amounts of heterogeneous datasets spanning from images, videos to robotics data, where the actionable visual objects (e.g., clickable buttons in GUI) in images are labeled by Set-of-Mark (SoM) for action grounding, and the object movements (e.g., the trace of human hands or robotic arms) in videos are labeled by Trace-of-Mark (ToM) for action planning. Extensive experiments show that SoM and ToM reach great synergy and facilitate the acquisition of spatial-temporal intelligence for our Magma model, which is fundamental to a wide range of tasks as shown in Fig.1. In particular, Magma creates new state-of-the-art results on UI navigation and robotic manipulation tasks, outperforming previous models that are specifically tailored to these tasks. On image and video-related multimodal tasks, Magma also compares favorably to popular large multimodal models that are trained on much larger datasets. We make our model and code public for reproducibility at https://microsoft.github.io/Magma. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13130v1-abstract-full').style.display = 'none'; document.getElementById('2502.13130v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 16 figures, technical report from MSR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07598">arXiv:2502.07598</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07598">pdf</a>, <a href="https://arxiv.org/format/2502.07598">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Towards spatial computing: recent advances in multimodal natural interaction for XR headsets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhimin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+M">Maohang Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shanghua Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+W">Weitao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+F">Feng Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07598v1-abstract-short" style="display: inline;"> With the widespread adoption of Extended Reality (XR) headsets, spatial computing technologies are gaining increasing attention. Spatial computing enables interaction with virtual elements through natural input methods such as eye tracking, hand gestures, and voice commands, thus placing natural human-computer interaction at its core. While previous surveys have reviewed conventional XR interactio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07598v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07598v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07598v1-abstract-full" style="display: none;"> With the widespread adoption of Extended Reality (XR) headsets, spatial computing technologies are gaining increasing attention. Spatial computing enables interaction with virtual elements through natural input methods such as eye tracking, hand gestures, and voice commands, thus placing natural human-computer interaction at its core. While previous surveys have reviewed conventional XR interaction techniques, recent advancements in natural interaction, particularly driven by artificial intelligence (AI) and large language models (LLMs), have introduced new paradigms and technologies. In this paper, we review research on multimodal natural interaction for wearable XR, focusing on papers published between 2022 and 2024 in six top venues: ACM CHI, UIST, IMWUT (Ubicomp), IEEE VR, ISMAR, and TVCG. We classify and analyze these studies based on application scenarios, operation types, and interaction modalities. This analysis provides a structured framework for understanding how researchers are designing advanced natural interaction techniques in XR. Based on these findings, we discuss the challenges in natural interaction techniques and suggest potential directions for future research. 
This review provides valuable insights for researchers aiming to design natural and efficient interaction systems for XR, ultimately contributing to the advancement of spatial computing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07598v1-abstract-full').style.display = 'none'; document.getElementById('2502.07598v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15096">arXiv:2501.15096</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15096">pdf</a>, <a href="https://arxiv.org/format/2501.15096">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Better Robustness: Progressively Joint Pose-3DGS Learning for Arbitrarily Long Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhen-Hui Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Sheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Y">Yu-Hui Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Nannan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yong-Jin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15096v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has emerged as a powerful representation due to its efficiency and high-fidelity rendering. However, 3DGS training requires a known camera pose for each input view, typically obtained by Structure-from-Motion (SfM) pipelines. Pioneering works have attempted to relax this restriction but still face difficulties when handling long sequences with complex camera trajectori&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15096v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15096v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15096v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has emerged as a powerful representation due to its efficiency and high-fidelity rendering. However, 3DGS training requires a known camera pose for each input view, typically obtained by Structure-from-Motion (SfM) pipelines. Pioneering works have attempted to relax this restriction but still face difficulties when handling long sequences with complex camera trajectories. In this work, we propose Rob-GS, a robust framework to progressively estimate camera poses and optimize 3DGS for arbitrarily long video sequences. 
Leveraging the inherent continuity of videos, we design an adjacent pose tracking method to ensure stable pose estimation between consecutive frames. To handle arbitrarily long inputs, we adopt a &#34;divide and conquer&#34; scheme that adaptively splits the video sequence into several segments and optimizes them separately. Extensive experiments on the Tanks and Temples dataset and our collected real-world dataset show that our Rob-GS outperforms the state-of-the-arts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15096v1-abstract-full').style.display = 'none'; document.getElementById('2501.15096v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12948">arXiv:2501.12948</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12948">pdf</a>, <a href="https://arxiv.org/format/2501.12948">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xingkai Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+F">Z. F. 
Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Ziyi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chengda Lu</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12948v1-abstract-short" style="display: inline;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12948v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12948v1-abstract-full" style="display: none;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'none'; document.getElementById('2501.12948v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10658">arXiv:2501.10658</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.10658">pdf</a>, <a href="https://arxiv.org/format/2501.10658">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LUT-DLA: Lookup Table as Efficient Extreme Low-Bit Deep Learning Accelerator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shengyu Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chunyun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+T">Ting Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Cheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sabry%2C+M+M">Mohamed M. Sabry</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Mao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10658v1-abstract-short" style="display: inline;"> The emergence of neural network capabilities invariably leads to a significant surge in computational demands due to expanding model sizes and increased computational complexity. To reduce model size and lower inference costs, recent research has focused on simplifying models and designing hardware accelerators using low-bit quantization. However, due to numerical representation limits, scalar qua&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10658v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10658v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10658v1-abstract-full" style="display: none;"> The emergence of neural network capabilities invariably leads to a significant surge in computational demands due to expanding model sizes and increased computational complexity. To reduce model size and lower inference costs, recent research has focused on simplifying models and designing hardware accelerators using low-bit quantization. However, due to numerical representation limits, scalar quantization cannot reduce bit width lower than 1-bit, diminishing its benefits. To break through these limitations, we introduce LUT-DLA, a Look-Up Table (LUT) Deep Learning Accelerator Framework that utilizes vector quantization to convert neural network models into LUTs, achieving extreme low-bit quantization. The LUT-DLA framework facilitates efficient and cost-effective hardware accelerator designs and supports the LUTBoost algorithm, which helps to transform various DNN models into LUT-based models via multistage training, drastically cutting both computational and hardware overhead. 
Additionally, through co-design space exploration, LUT-DLA assesses the impact of various model and hardware parameters to fine-tune hardware configurations for different application scenarios, optimizing performance and efficiency. Our comprehensive experiments show that LUT-DLA achieves improvements in power efficiency and area efficiency with gains of $1.4$~$7.0\times$ and $1.5$~$146.1\times$, respectively, while maintaining only a modest accuracy drop. For CNNs, accuracy decreases by $0.1\%$~$3.1\%$ using the $L_2$ distance similarity, $0.1\%$~$3.4\%$ with the $L_1$ distance similarity, and $0.1\%$~$3.8\%$ when employing the Chebyshev distance similarity. For transformer-based models, the accuracy drop ranges from $1.4\%$ to $3.0\%$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10658v1-abstract-full').style.display = 'none'; document.getElementById('2501.10658v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07297">arXiv:2501.07297</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07297">pdf</a>, <a href="https://arxiv.org/format/2501.07297">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Toward Realistic Camouflaged Object Detection: Benchmarks and Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xin%2C+Z">Zhimeng Xin</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tianxu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shiming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shuo Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Z">Zijing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+Y">Yixiong Zou</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+X">Xinge You</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yufei Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07297v1-abstract-short" style="display: inline;"> Camouflaged object detection (COD) primarily relies on semantic or instance segmentation methods. While these methods have made significant advancements in identifying the contours of camouflaged objects, they may be inefficient or cost-effective for tasks that only require the specific location of the object. 
Object detection algorithms offer an optimized solution for Realistic Camouflaged Object&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07297v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07297v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07297v1-abstract-full" style="display: none;"> Camouflaged object detection (COD) primarily relies on semantic or instance segmentation methods. While these methods have made significant advancements in identifying the contours of camouflaged objects, they may be inefficient or cost-effective for tasks that only require the specific location of the object. Object detection algorithms offer an optimized solution for Realistic Camouflaged Object Detection (RCOD) in such cases. However, detecting camouflaged objects remains a formidable challenge due to the high degree of similarity between the features of the objects and their backgrounds. Unlike segmentation methods that perform pixel-wise comparisons to differentiate between foreground and background, object detectors omit this analysis, further aggravating the challenge. To solve this problem, we propose a camouflage-aware feature refinement (CAFR) strategy. Since camouflaged objects are not rare categories, CAFR fully utilizes a clear perception of the current object within the prior knowledge of large models to assist detectors in deeply understanding the distinctions between background and foreground. Specifically, in CAFR, we introduce the Adaptive Gradient Propagation (AGP) module that fine-tunes all feature extractor layers in large detection models to fully refine class-specific features from camouflaged contexts. We then design the Sparse Feature Refinement (SFR) module that optimizes the transformer-based feature extractor to focus primarily on capturing class-specific features in camouflaged scenarios. To facilitate the assessment of RCOD tasks, we manually annotate the labels required for detection on three existing segmentation COD datasets, creating a new benchmark for RCOD tasks. Code and datasets are available at: https://github.com/zhimengXin/RCOD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07297v1-abstract-full').style.display = 'none'; document.getElementById('2501.07297v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01028">arXiv:2501.01028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01028">pdf</a>, <a href="https://arxiv.org/format/2501.01028">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xinshuo Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Z">Zifei Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xinping Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zetian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhenyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dongfang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shaolin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+X">Xinyuan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+B">Baotian Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haofen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jun Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01028v4-abstract-short" style="display: inline;"> As retrieval-augmented generation prevails in large language models, embedding models are becoming increasingly crucial. Despite the growing number of general embedding models, prior work often overlooks the critical role of training data quality. In this work, we introduce KaLM-Embedding, a general multilingual embedding model that leverages a large quantity of cleaner, more diverse, and domain-s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01028v4-abstract-full').style.display = 'inline'; document.getElementById('2501.01028v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01028v4-abstract-full" style="display: none;"> As retrieval-augmented generation prevails in large language models, embedding models are becoming increasingly crucial. Despite the growing number of general embedding models, prior work often overlooks the critical role of training data quality. In this work, we introduce KaLM-Embedding, a general multilingual embedding model that leverages a large quantity of cleaner, more diverse, and domain-specific training data. Our model has been trained with key techniques proven to enhance performance: (1) persona-based synthetic data to create diversified examples distilled from LLMs, (2) ranking consistency filtering to remove less informative samples, and (3) semi-homogeneous task batch sampling to improve training efficacy. Departing from traditional BERT-like architectures, we adopt Qwen2-0.5B as the pre-trained model, facilitating the adaptation of auto-regressive language models for general embedding tasks. 
Extensive evaluations of the MTEB benchmark across multiple languages show that our model outperforms others of comparable size, setting a new standard for multilingual embedding models with &lt;1B parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01028v4-abstract-full').style.display = 'none'; document.getElementById('2501.01028v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report. 23 pages, 6 figures, 10 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19437">arXiv:2412.19437</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.19437">pdf</a>, <a href="https://arxiv.org/format/2412.19437">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-V3 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chengda Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chenggang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+C">Chong Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Deli Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+D">Dongjie Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+E">Erhang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+F">Fangyun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+F">Fucong Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+F">Fuli Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+G">Guangbo Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guowei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">H. Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+H">Han Bao</a> , et al. 
(175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19437v2-abstract-short" style="display: inline;"> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for loa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19437v2-abstract-full').style.display = 'inline'; document.getElementById('2412.19437v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19437v2-abstract-full" style="display: none;"> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19437v2-abstract-full').style.display = 'none'; document.getElementById('2412.19437v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18483">arXiv:2412.18483</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18483">pdf</a>, <a href="https://arxiv.org/format/2412.18483">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A region-wide, multi-year set of crop field boundary labels for Africa </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Estes%2C+L+D">L. D. 
Estes</a>, <a href="/search/cs?searchtype=author&amp;query=Wussah%2C+A">A. Wussah</a>, <a href="/search/cs?searchtype=author&amp;query=Asipunu%2C+M">M. Asipunu</a>, <a href="/search/cs?searchtype=author&amp;query=Gathigi%2C+M">M. Gathigi</a>, <a href="/search/cs?searchtype=author&amp;query=Kova%C4%8Di%C4%8D%2C+P">P. Kova膷i膷</a>, <a href="/search/cs?searchtype=author&amp;query=Muhando%2C+J">J. Muhando</a>, <a href="/search/cs?searchtype=author&amp;query=Yeboah%2C+B+V">B. V. Yeboah</a>, <a href="/search/cs?searchtype=author&amp;query=Addai%2C+F+K">F. K. Addai</a>, <a href="/search/cs?searchtype=author&amp;query=Akakpo%2C+E+S">E. S. Akakpo</a>, <a href="/search/cs?searchtype=author&amp;query=Allotey%2C+M+K">M. K. Allotey</a>, <a href="/search/cs?searchtype=author&amp;query=Amkoya%2C+P">P. Amkoya</a>, <a href="/search/cs?searchtype=author&amp;query=Amponsem%2C+E">E. Amponsem</a>, <a href="/search/cs?searchtype=author&amp;query=Donkoh%2C+K+D">K. D. Donkoh</a>, <a href="/search/cs?searchtype=author&amp;query=Ha%2C+N">N. Ha</a>, <a href="/search/cs?searchtype=author&amp;query=Heltzel%2C+E">E. Heltzel</a>, <a href="/search/cs?searchtype=author&amp;query=Juma%2C+C">C. Juma</a>, <a href="/search/cs?searchtype=author&amp;query=Mdawida%2C+R">R. Mdawida</a>, <a href="/search/cs?searchtype=author&amp;query=Miroyo%2C+A">A. Miroyo</a>, <a href="/search/cs?searchtype=author&amp;query=Mucha%2C+J">J. Mucha</a>, <a href="/search/cs?searchtype=author&amp;query=Mugami%2C+J">J. Mugami</a>, <a href="/search/cs?searchtype=author&amp;query=Mwawaza%2C+F">F. Mwawaza</a>, <a href="/search/cs?searchtype=author&amp;query=Nyarko%2C+D+A">D. A. Nyarko</a>, <a href="/search/cs?searchtype=author&amp;query=Oduor%2C+P">P. Oduor</a>, <a href="/search/cs?searchtype=author&amp;query=Ohemeng%2C+K+N">K. N. Ohemeng</a>, <a href="/search/cs?searchtype=author&amp;query=Segbefia%2C+S+I+D">S. I. D. Segbefia</a> , et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18483v1-abstract-short" style="display: inline;"> African agriculture is undergoing rapid transformation. Annual maps of crop fields are key to understanding the nature of this transformation, but such maps are currently lacking and must be developed using advanced machine learning models trained on high resolution remote sensing imagery. To enable the development of such models, we delineated field boundaries in 33,746 Planet images captured bet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18483v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18483v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18483v1-abstract-full" style="display: none;"> African agriculture is undergoing rapid transformation. Annual maps of crop fields are key to understanding the nature of this transformation, but such maps are currently lacking and must be developed using advanced machine learning models trained on high resolution remote sensing imagery. To enable the development of such models, we delineated field boundaries in 33,746 Planet images captured between 2017 and 2023 across the continent using a custom labeling platform with built-in procedures for assessing and mitigating label error. 
We collected 42,403 labels, including 7,204 labels arising from tasks dedicated to assessing label quality (Class 1 labels), 32,167 from sites mapped once by a single labeller (Class 2) and 3,032 labels from sites where 3 or more labellers were tasked to map the same location (Class 4). Class 1 labels were used to calculate labeller-specific quality scores, while Class 1 and 4 sites mapped by at least 3 labellers were used to further evaluate label uncertainty using a Bayesian risk metric. Quality metrics showed that label quality was moderately high (0.75) for measures of total field extent, but low regarding the number of individual fields delineated (0.33), and the position of field edges (0.05). These values are expected when delineating small-scale fields in 3-5 m resolution imagery, which can be too coarse to reliably distinguish smaller fields, particularly in dense croplands, and therefore requires substantial labeller judgement. Nevertheless, previous work shows that such labels can train effective field mapping models. Furthermore, this large, probabilistic sample on its own provides valuable insight into regional agricultural characteristics, highlighting variations in the median field size and density. The imagery and vectorized labels along with quality information is available for download from two public repositories. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18483v1-abstract-full').style.display = 'none'; document.getElementById('2412.18483v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17847">arXiv:2412.17847</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17847">pdf</a>, <a href="https://arxiv.org/format/2412.17847">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Data Provenance Gap Across Text, Speech and Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Longpre%2C+S">Shayne Longpre</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+N">Nikhil Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Cherep%2C+M">Manuel Cherep</a>, <a href="/search/cs?searchtype=author&amp;query=Tiwary%2C+K">Kushagra Tiwary</a>, <a href="/search/cs?searchtype=author&amp;query=Materzynska%2C+J">Joanna Materzynska</a>, <a href="/search/cs?searchtype=author&amp;query=Brannon%2C+W">William Brannon</a>, <a href="/search/cs?searchtype=author&amp;query=Mahari%2C+R">Robert Mahari</a>, <a href="/search/cs?searchtype=author&amp;query=Obeng-Marnu%2C+N">Naana Obeng-Marnu</a>, <a href="/search/cs?searchtype=author&amp;query=Dey%2C+M">Manan Dey</a>, <a href="/search/cs?searchtype=author&amp;query=Hamdy%2C+M">Mohammed Hamdy</a>, <a href="/search/cs?searchtype=author&amp;query=Saxena%2C+N">Nayan Saxena</a>, <a href="/search/cs?searchtype=author&amp;query=Anis%2C+A+M">Ahmad Mustafa Anis</a>, <a href="/search/cs?searchtype=author&amp;query=Alghamdi%2C+E+A">Emad A. Alghamdi</a>, <a href="/search/cs?searchtype=author&amp;query=Chien%2C+V+M">Vu Minh Chien</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+K">Kun Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yizhi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+M">Minnie Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Dinh%2C+A">An Dinh</a>, <a href="/search/cs?searchtype=author&amp;query=Mohanty%2C+S">Shrestha Mohanty</a>, <a href="/search/cs?searchtype=author&amp;query=Mataciunas%2C+D">Deividas Mataciunas</a>, <a href="/search/cs?searchtype=author&amp;query=South%2C+T">Tobin South</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianguo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+A+N">Ariel N. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Lund%2C+C+S">Campbell S. Lund</a> , et al. 
Abstract: Progress in AI is driven largely by the scale and quality of training data. Despite this, there is a deficit of empirical analysis examining the attributes of well-established datasets beyond text. In this work we conduct the largest and first-of-its-kind longitudinal audit across modalities -- popular text, speech, and video datasets -- from their detailed sourcing trends and use restrictions to their geographical and linguistic representation. Our manual analysis covers nearly 4000 public datasets between 1990-2024, spanning 608 languages, 798 sources, 659 organizations, and 67 countries. We find that multimodal machine learning applications have overwhelmingly turned to web-crawled, synthetic, and social media platforms, such as YouTube, for their training sets, eclipsing all other sources since 2019. Secondly, tracing the chain of dataset derivations, we find that while less than 33% of datasets are restrictively licensed, over 80% of the source content in widely-used text, speech, and video datasets carry non-commercial restrictions. Finally, counter to the rising number of languages and geographies represented in public AI training datasets, our audit demonstrates that measures of relative geographical and multilingual representation have failed to significantly improve their coverage since 2013. We believe the breadth of our audit enables us to empirically examine trends in data sourcing, restrictions, and Western-centricity at an ecosystem level, and that visibility into these questions is essential to progress in responsible AI. As a contribution to ongoing improvements in dataset transparency and responsible use, we release our entire multimodal audit, allowing practitioners to trace data provenance across text, speech, and video.
Submitted 18 February, 2025; v1 submitted 18 December, 2024; originally announced December 2024.
Comments: ICLR 2025. 10 pages, 5 figures (main paper)

arXiv:2412.15634 (https://arxiv.org/abs/2412.15634) [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: Darkit: A User-Friendly Software Toolkit for Spiking Large Language Model
Authors: Xin Du, Shifan Ye, Qian Zheng, Yangfan Hu, Rui Yan, Shunyu Qi, Shuyang Chen, Huajin Tang, Gang Pan, Shuiguang Deng
Abstract: Large language models (LLMs) have been widely applied in various practical applications, typically comprising billions of parameters, with inference processes requiring substantial energy and computational resources. In contrast, the human brain, employing bio-plausible spiking mechanisms, can accomplish the same tasks while significantly reducing energy consumption, even with a similar number of parameters. Building on this, several pioneering researchers have proposed and implemented various large language models that leverage spiking neural networks. They have demonstrated the feasibility of these models, validated their performance, and open-sourced their frameworks and partial source code. To accelerate the adoption of brain-inspired large language models and facilitate secondary development for researchers, we are releasing a software toolkit named DarwinKit (Darkit). The toolkit is designed specifically for learners, researchers, and developers working on spiking large models, offering a suite of highly user-friendly features that greatly simplify the learning, deployment, and development processes.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15634v1-abstract-full').style.display = 'none'; document.getElementById('2412.15634v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13533">arXiv:2412.13533</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13533">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Language-guided Medical Image Segmentation with Target-informed Multi-level Contrastive Alignments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingjian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+M">Mingyuan Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shuchang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+D+D">David Dagan Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+L">Lei Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Jinman Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13533v1-abstract-short" style="display: inline;"> Medical image segmentation is crucial in modern medical image analysis, which can aid into diagnosis of various disease conditions. Recently, language-guided segmentation methods have shown promising results in automating image segmentation where text reports are incorporated as guidance. These text reports, containing image impressions and insights given by clinicians, provides auxiliary guidance&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13533v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13533v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13533v1-abstract-full" style="display: none;"> Medical image segmentation is crucial in modern medical image analysis, which can aid into diagnosis of various disease conditions. Recently, language-guided segmentation methods have shown promising results in automating image segmentation where text reports are incorporated as guidance. These text reports, containing image impressions and insights given by clinicians, provides auxiliary guidance. However, these methods neglect the inherent pattern gaps between the two distinct modalities, which leads to sub-optimal image-text feature fusion without proper cross-modality feature alignments. Contrastive alignments are widely used to associate image-text semantics in representation learning; however, it has not been exploited to bridge the pattern gaps in language-guided segmentation that relies on subtle low level image details to represent diseases. 
Existing contrastive alignment methods typically align high-level global image semantics without involving low-level, localized target information, and therefore fail to explore fine-grained text guidance for language-guided segmentation. In this study, we propose a language-guided segmentation network with Target-informed Multi-level Contrastive Alignments (TMCA). TMCA enables target-informed cross-modality alignments and fine-grained text guidance to bridge the pattern gaps in language-guided segmentation. Specifically, we introduce: 1) a target-sensitive semantic distance module that enables granular image-text alignment modelling, and 2) a multi-level alignment strategy that directs text guidance on low-level image features. In addition, a language-guided target enhancement module is proposed to leverage the aligned text to redirect attention to focus on critical localized image features. Extensive experiments on 4 image-text datasets, involving 3 medical imaging modalities, demonstrated that our TMCA achieved superior performance.
Submitted 18 December, 2024; originally announced December 2024.
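Note: for readers unfamiliar with the contrastive alignments this abstract refers to, the sketch below shows a generic InfoNCE-style image-text contrastive loss in PyTorch. It is only a minimal illustration of the general idea, not of the TMCA modules; the paper's target-sensitive distance and multi-level strategy are not reproduced here.

    # Generic image-text contrastive (InfoNCE-style) loss -- illustration only.
    import torch
    import torch.nn.functional as F

    def contrastive_alignment_loss(img_feat, txt_feat, temperature=0.07):
        """img_feat, txt_feat: (batch, dim) paired embeddings."""
        img = F.normalize(img_feat, dim=-1)
        txt = F.normalize(txt_feat, dim=-1)
        logits = img @ txt.t() / temperature              # pairwise similarities
        targets = torch.arange(img.size(0), device=img.device)
        # Symmetric cross-entropy: matched image-text pairs are the positives.
        return 0.5 * (F.cross_entropy(logits, targets) +
                      F.cross_entropy(logits.t(), targets))

    # Example usage with random features
    loss = contrastive_alignment_loss(torch.randn(8, 256), torch.randn(8, 256))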

arXiv:2412.09603 (https://arxiv.org/abs/2412.09603) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Do Multimodal Large Language Models See Like Humans?
Authors: Jiaying Lin, Shuquan Ye, Rynson W. H. Lau
Abstract: Multimodal Large Language Models (MLLMs) have achieved impressive results on various vision tasks, leveraging recent advancements in large language models. However, a critical question remains unaddressed: do MLLMs perceive visual information similarly to humans? Current benchmarks lack the ability to evaluate MLLMs from this perspective. To address this challenge, we introduce HVSBench, a large-scale benchmark designed to assess the alignment between MLLMs and the human visual system (HVS) on fundamental vision tasks that mirror human vision. HVSBench curated over 85K multimodal samples, spanning 13 categories and 5 fields in HVS, including Prominence, Subitizing, Prioritizing, Free-Viewing, and Searching. Extensive experiments demonstrate the effectiveness of our benchmark in providing a comprehensive evaluation of MLLMs. Specifically, we evaluate 13 MLLMs, revealing that even the best models show significant room for improvement, with most achieving only moderate results. Our experiments reveal that HVSBench presents a new and significant challenge for cutting-edge MLLMs. We believe that HVSBench will facilitate research on human-aligned and explainable MLLMs, marking a key step in understanding how MLLMs perceive and process visual information.
Submitted 12 December, 2024; originally announced December 2024.
Comments: Project page: https://jiaying.link/HVSBench/

arXiv:2412.05271 (https://arxiv.org/abs/2412.05271) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling
Authors: Zhe Chen, Weiyun Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Erfei Cui, Jinguo Zhu, Shenglong Ye, Hao Tian, Zhaoyang Liu, Lixin Gu, Xuehui Wang, Qingyun Li, Yimin Ren, Zixuan Chen, Jiapeng Luo, Jiahao Wang, Tan Jiang, Bo Wang, Conghui He, Botian Shi, Xingcheng Zhang, Han Lv, Yi Wang, Wenqi Shao, et al. (17 additional authors not shown)
Abstract: We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision encoders, language models, dataset sizes, and test-time configurations. Through extensive evaluations on a wide range of benchmarks, including multi-discipline reasoning, document understanding, multi-image / video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capabilities, and pure language processing, InternVL 2.5 exhibits competitive performance, rivaling leading commercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is the first open-source MLLM to surpass 70% on the MMMU benchmark, achieving a 3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing strong potential for test-time scaling. We hope this model contributes to the open-source community by setting new standards for developing and applying multimodal AI systems.
HuggingFace demo: https://huggingface.co/spaces/OpenGVLab/InternVL
Submitted 13 January, 2025; v1 submitted 6 December, 2024; originally announced December 2024.
Comments: Technical Report

arXiv:2411.17257 (https://arxiv.org/abs/2411.17257) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Disentangled Interpretable Representation for Efficient Long-term Time Series Forecasting
Authors: Yuang Zhao, Tianyu Li, Jiadong Chen, Shenrong Ye, Fuxin Jiang, Tieying Zhang, Xiaofeng Gao
Abstract: Industry 5.0 introduces new challenges for Long-term Time Series Forecasting (LTSF), characterized by high-dimensional, high-resolution data and high-stakes application scenarios. Against this backdrop, developing efficient and interpretable models for LTSF becomes a key challenge. Existing deep learning and linear models often suffer from excessive parameter complexity and lack intuitive interpretability.
To address these issues, we propose DiPE-Linear, a Disentangled interpretable Parameter-Efficient Linear network. DiPE-Linear incorporates three temporal components: Static Frequential Attention (SFA), Static Temporal Attention (STA), and Independent Frequential Mapping (IFM). These components alternate between learning in the frequency and time domains to achieve disentangled interpretability. The decomposed model structure reduces parameter complexity from quadratic in fully connected networks (FCs) to linear, and computational complexity from quadratic to log-linear. Additionally, a Low-Rank Weight Sharing policy enhances the model's ability to handle multivariate series. Despite operating within a subspace of FCs with limited expressive capacity, DiPE-Linear demonstrates comparable or superior performance to both FCs and nonlinear models across multiple open-source and real-world LTSF datasets, validating the effectiveness of its carefully designed structure. The combination of efficiency, accuracy, and interpretability makes DiPE-Linear a strong candidate for advancing LTSF in both research and real-world applications. The source code is available at https://github.com/wintertee/DiPE-Linear.
Submitted 26 November, 2024; originally announced November 2024.
Comments: This work is submitted to IEEE International Conference on Data Engineering (ICDE) 2025
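Note: as a rough sketch of the flavor of a parameter-efficient, frequency-domain linear forecaster with low-rank weights shared across channels, consider the toy module below. The module name, shapes, and rank are invented for illustration; it does not reproduce the SFA, STA, or IFM components of DiPE-Linear.

    # Toy frequency-domain linear forecaster with a low-rank, channel-shared weight.
    import torch
    import torch.nn as nn

    class LowRankFreqLinear(nn.Module):
        def __init__(self, in_len, out_len, rank=8):
            super().__init__()
            f_in = in_len // 2 + 1                 # rfft bins of the input window
            f_out = out_len // 2 + 1               # rfft bins of the forecast horizon
            self.U = nn.Parameter(0.01 * torch.randn(f_out, rank))
            self.V = nn.Parameter(0.01 * torch.randn(rank, f_in))
            self.out_len = out_len

        def forward(self, x):                      # x: (batch, channels, in_len)
            xf = torch.fft.rfft(x, dim=-1)         # move to the frequency domain
            W = (self.U @ self.V).to(xf.dtype)     # low-rank map, shared by all channels
            yf = torch.einsum('of,bcf->bco', W, xf)
            return torch.fft.irfft(yf, n=self.out_len, dim=-1)

    # Example: forecast 24 steps from a 96-step window over 7 channels.
    y = LowRankFreqLinear(96, 24)(torch.randn(4, 7, 96))   # -> (4, 7, 24)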

arXiv:2411.04397 (https://arxiv.org/abs/2411.04397) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: A Bayesian Mixture Model of Temporal Point Processes with Determinantal Point Process Prior
Authors: Yiwei Dong, Shaoxin Ye, Yuwen Cao, Qiyu Han, Hongteng Xu, Hanfang Yang
Abstract: Asynchronous event sequence clustering aims to group similar event sequences in an unsupervised manner. Mixture models of temporal point processes have been proposed to solve this problem, but they often suffer from overfitting, leading to excessive cluster generation with a lack of diversity. To overcome these limitations, we propose a Bayesian mixture model of Temporal Point Processes with a Determinantal Point Process prior (TP$^2$DP$^2$), together with an efficient posterior inference algorithm based on conditional Gibbs sampling. Our work provides a flexible learning framework for event sequence clustering, enabling automatic identification of the potential number of clusters and accurate grouping of sequences with similar features. It is applicable to a wide range of parametric temporal point processes, including neural network-based models. Experimental results on both synthetic and real-world data suggest that our framework could produce moderately fewer yet more diverse mixture components, and achieve outstanding results across multiple evaluation metrics.
Submitted 6 November, 2024; originally announced November 2024.

arXiv:2411.02093 (https://arxiv.org/abs/2411.02093) [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: Do Advanced Language Models Eliminate the Need for Prompt Engineering in Software Engineering?
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guoqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zeyu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">Zhihao Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Sixiang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yizhou Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yifan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Q">Qingyuan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+D">Dan Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02093v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have significantly advanced software engineering (SE) tasks, with prompt engineering techniques enhancing their performance in code-related areas. However, the rapid development of foundational LLMs such as the non-reasoning model GPT-4o and the reasoning model o1 raises questions about the continued effectiveness of these prompt engineering techniques. This paper pres&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02093v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02093v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02093v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have significantly advanced software engineering (SE) tasks, with prompt engineering techniques enhancing their performance in code-related areas. However, the rapid development of foundational LLMs such as the non-reasoning model GPT-4o and the reasoning model o1 raises questions about the continued effectiveness of these prompt engineering techniques. This paper presents an extensive empirical study that reevaluates various prompt engineering techniques within the context of these advanced LLMs. Focusing on three representative SE tasks, i.e., code generation, code translation, and code summarization, we assess whether prompt engineering techniques still yield improvements with advanced models, the actual effectiveness of reasoning models compared to non-reasoning models, and whether the benefits of using these advanced models justify their increased costs. Our findings reveal that prompt engineering techniques developed for earlier LLMs may provide diminished benefits or even hinder performance when applied to advanced models. In reasoning LLMs, the ability of sophisticated built-in reasoning reduces the impact of complex prompts, sometimes making simple zero-shot prompting more effective. Furthermore, while reasoning models outperform non-reasoning models in tasks requiring complex reasoning, they offer minimal advantages in tasks that do not need reasoning and may incur unnecessary costs. Based on our study, we provide practical guidance for practitioners on selecting appropriate prompt engineering techniques and foundational LLMs, considering factors such as task requirements, operational costs, and environmental impact. Our work contributes to a deeper understanding of effectively harnessing advanced LLMs in SE tasks, informing future research and application development. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02093v1-abstract-full').style.display = 'none'; document.getElementById('2411.02093v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16261">arXiv:2410.16261</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16261">pdf</a>, <a href="https://arxiv.org/format/2410.16261">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mini-InternVL: A Flexible-Transfer Pocket Multimodal Model with 5% Parameters and 90% Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhangwei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+E">Erfei Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yiming Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weiyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jinguo Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+H">Hao Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shenglong Ye</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Junjun He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xizhou Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+L">Lewei Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+T">Tong Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenhai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16261v3-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in vision-language tasks across a broad spectrum of domains. However, the large model scale and associated high computational costs pose significant challenges for training and deploying MLLMs on consumer-grade GPUs or edge devices, thereby hindering their widespread application. In this work, we introduce Mini-Inter&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16261v3-abstract-full').style.display = 'inline'; document.getElementById('2410.16261v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16261v3-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in vision-language tasks across a broad spectrum of domains. 
However, the large model scale and associated high computational costs pose significant challenges for training and deploying MLLMs on consumer-grade GPUs or edge devices, thereby hindering their widespread application. In this work, we introduce Mini-InternVL, a series of MLLMs with parameters ranging from 1B to 4B, which achieves 90% of the performance with only 5% of the parameters. This significant improvement in efficiency and effectiveness makes our models more accessible and applicable in various real-world scenarios. To further promote the adoption of our models, we develop a unified adaptation framework for Mini-InternVL, which enables our models to transfer to and outperform specialized models in downstream tasks, including autonomous driving, medical imaging, and remote sensing. We believe that our study can provide valuable insights and resources to advance the development of efficient and effective MLLMs. Code is available at https://github.com/OpenGVLab/InternVL.
Submitted 7 November, 2024; v1 submitted 21 October, 2024; originally announced October 2024.
Comments: Technical report

arXiv:2410.13681 (https://arxiv.org/abs/2410.13681) [pdf, other]
Subjects: stat.ML (Machine Learning); cs.LG (Machine Learning); stat.ME (Methodology)
Title: Ab initio nonparametric variable selection for scalable Symbolic Regression with large $p$
Authors: Shengbin Ye, Meng Li
Abstract: Symbolic regression (SR) is a powerful technique for discovering symbolic expressions that characterize nonlinear relationships in data, gaining increasing attention for its interpretability, compactness, and robustness. However, existing SR methods do not scale to datasets with a large number of input variables (referred to as extreme-scale SR), which are common in modern scientific applications. This "large $p$" setting, often accompanied by measurement error, leads to slow performance of SR methods and overly complex expressions that are difficult to interpret. To address this scalability challenge, we propose a method called PAN+SR, which combines a key idea of ab initio nonparametric variable selection with SR to efficiently pre-screen large input spaces and reduce search complexity while maintaining accuracy. The use of nonparametric methods eliminates model misspecification, supporting a strategy called parametric-assisted nonparametric (PAN). We also extend SRBench, an open-source benchmarking platform, by incorporating high-dimensional regression problems with various signal-to-noise ratios. Our results demonstrate that PAN+SR consistently enhances the performance of 17 contemporary SR methods, enabling several to achieve state-of-the-art performance on these challenging datasets.
Submitted 17 October, 2024; originally announced October 2024.
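Note: as a rough illustration of the general pre-screening idea described in this abstract (not the paper's actual PAN procedure), the sketch below uses a nonparametric model's feature importances to shrink a large input space before handing the surviving variables to a symbolic regression tool; the random-forest screen and the top-k rule are assumptions made purely for illustration.

    # Illustrative variable pre-screening before symbolic regression.
    # The screening model and threshold are arbitrary choices for this sketch.
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(0)
    n, p = 500, 200                      # toy "large p" setting
    X = rng.normal(size=(n, p))
    y = 2.0 * np.sin(X[:, 3]) + X[:, 17] ** 2 + 0.1 * rng.normal(size=n)

    screen = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, y)
    top_k = 10
    keep = np.argsort(screen.feature_importances_)[::-1][:top_k]
    print("variables passed to the SR search:", sorted(keep.tolist()))
    # X[:, keep] would then be fed to the symbolic regression method of choice.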

arXiv:2410.11758 (https://arxiv.org/abs/2410.11758) [pdf, other]
Subjects: cs.RO (Robotics); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Latent Action Pretraining from Videos
Authors: Seonghyeon Ye, Joel Jang, Byeongguk Jeon, Sejune Joo, Jianwei Yang, Baolin Peng, Ajay Mandlekar, Reuben Tan, Yu-Wei Chao, Bill Yuchen Lin, Lars Liden, Kimin Lee, Jianfeng Gao, Luke Zettlemoyer, Dieter Fox, Minjoon Seo
Abstract: We introduce Latent Action Pretraining for general Action models (LAPA), an unsupervised method for pretraining Vision-Language-Action (VLA) models without ground-truth robot action labels. Existing Vision-Language-Action models require action labels typically collected by human teleoperators during pretraining, which significantly limits possible data sources and scale.
In this work, we propose a method to learn from internet-scale videos that do not have robot action labels. We first train an action quantization model leveraging a VQ-VAE-based objective to learn discrete latent actions between image frames, then pretrain a latent VLA model to predict these latent actions from observations and task descriptions, and finally finetune the VLA on small-scale robot manipulation data to map from latent to robot actions. Experimental results demonstrate that our method significantly outperforms existing techniques that train robot manipulation policies from large-scale videos. Furthermore, it outperforms the state-of-the-art VLA model trained with robotic action labels on real-world manipulation tasks that require language conditioning, generalization to unseen objects, and semantic generalization to unseen instructions. Training only on human manipulation videos also shows positive transfer, opening up the potential for leveraging web-scale data for robotics foundation models.
Submitted 15 October, 2024; originally announced October 2024.
Comments: Website: https://latentactionpretraining.github.io

arXiv:2410.07002 (https://arxiv.org/abs/2410.07002) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.SE (Software Engineering)
Title: CursorCore: Assist Programming through Aligning Anything
Authors: Hao Jiang, Qi Liu, Rui Li, Shengyu Ye, Shijin Wang
Abstract: Large language models have been successfully applied to programming assistance tasks, such as code completion, code insertion, and instructional code editing. However, these applications remain insufficiently automated and struggle to effectively integrate various types of information during the programming process, including coding history, current code, and user instructions.
In this work, we propose a new conversational framework that comprehensively integrates these information sources, collect data to train our models, and evaluate their performance. Firstly, to thoroughly evaluate how well models align with different types of information and the quality of their outputs, we introduce a new benchmark, APEval (Assist Programming Eval), to comprehensively assess the performance of models in programming assistance tasks. Then, for data collection, we develop a data generation pipeline, Programming-Instruct, which synthesizes training data from diverse sources, such as GitHub and online judge platforms. This pipeline can automatically generate various types of messages throughout the programming process. Finally, using this pipeline, we generate 219K samples, fine-tune multiple models, and develop the CursorCore series. We show that CursorCore outperforms other models of comparable size. This framework unifies applications such as inline chat and automated editing, and contributes to the advancement of coding assistants. Code, models and data are freely available at https://github.com/TechxGenus/CursorCore.
Submitted 14 December, 2024; v1 submitted 9 October, 2024; originally announced October 2024.

arXiv:2410.01638 (https://arxiv.org/abs/2410.01638) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Data Extrapolation for Text-to-image Generation on Small Datasets
Authors: Senmao Ye, Fei Liu
Abstract: Text-to-image generation requires a large amount of training data to synthesize high-quality images. To augment training data, previous methods rely on data interpolations like cropping, flipping, and mixing up, which fail to introduce new information and yield only marginal improvements. In this paper, we propose a new data augmentation method for text-to-image generation using linear extrapolation. Specifically, we apply linear extrapolation only on text features, and new image data are retrieved from the internet by search engines. For the reliability of new text-image pairs, we design two outlier detectors to purify retrieved images. Based on extrapolation, we construct a training set dozens of times larger than the original dataset, resulting in a significant improvement in text-to-image performance. Moreover, we propose a NULL-guidance to refine score estimation, and apply recurrent affine transformation to fuse text information. Our model achieves FID scores of 7.91, 9.52 and 5.00 on the CUB, Oxford and COCO datasets. The code and data will be available on GitHub (https://github.com/senmaoy/RAT-Diffusion).
Submitted 2 October, 2024; originally announced October 2024.
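Note: to make the "linear extrapolation on text features" idea concrete, here is a tiny sketch of extrapolating beyond a pair of text embeddings. The embedding dimensionality and extrapolation factor are invented for illustration and do not come from the paper's actual pipeline or retrieval step.

    # Linear extrapolation of text embeddings: move beyond t_a, away from t_b.
    import numpy as np

    rng = np.random.default_rng(1)
    t_a = rng.normal(size=512)           # embedding of a caption
    t_b = rng.normal(size=512)           # embedding of a related caption

    lam = 0.5                            # extrapolation strength (illustrative)
    t_new = t_a + lam * (t_a - t_b)      # extrapolated "new" text feature
    # t_new would then be used to retrieve candidate images, which the paper
    # filters with outlier detectors before adding them to the training set.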

arXiv:2409.17066 (https://arxiv.org/abs/2409.17066) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: VPTQ: Extreme Low-bit Vector Post-Training Quantization for Large Language Models
Authors: Yifei Liu, Jicheng Wen, Yang Wang, Shengyu Ye, Li Lyna Zhang, Ting Cao, Cheng Li, Mao Yang
Abstract: Scaling model size significantly challenges the deployment and inference of Large Language Models (LLMs). Due to the redundancy in LLM weights, recent research has focused on pushing weight-only quantization to extremely low bit-widths (even down to 2 bits). This reduces memory requirements, optimizes storage costs, and decreases memory bandwidth needs during inference. However, due to numerical representation limitations, traditional scalar-based weight quantization struggles to reach such extreme low-bit levels. Recent research on Vector Quantization (VQ) for LLMs has demonstrated the potential for extremely low-bit model quantization by compressing vectors into indices using lookup tables. In this paper, we introduce Vector Post-Training Quantization (VPTQ) for extremely low-bit quantization of LLMs. We use Second-Order Optimization to formulate the LLM VQ problem and guide our quantization algorithm design by solving the optimization. We further refine the weights using Channel-Independent Second-Order Optimization for a granular VQ. In addition, by decomposing the optimization problem, we propose a brief and effective codebook initialization algorithm. We also extend VPTQ to support residual and outlier quantization, which enhances model accuracy and further compresses the model.
Our experimental results show that VPTQ reduces model quantization perplexity by 0.01-0.34 on LLaMA-2, 0.38-0.68 on Mistral-7B, and 4.41-7.34 on LLaMA-3 over SOTA at 2-bit, with average accuracy improvements of 0.79-1.5% on LLaMA-2, 1% on Mistral-7B, and 11-22% on LLaMA-3 on QA tasks. We use only 10.4-18.6% of the quantization algorithm execution time, resulting in a 1.6-1.8x increase in inference throughput compared to SOTA.
Submitted 22 October, 2024; v1 submitted 25 September, 2024; originally announced September 2024.
Comments: EMNLP 2024, Main, Poster.
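For intuition, the lookup-table compression underlying vector quantization can be illustrated with a naive k-means codebook. This sketch is not VPTQ itself (which adds second-order optimization, channel-independent refinement, and residual/outlier quantization); the group size, codebook size, and iteration count below are arbitrary choices for the example.

```python
import numpy as np

def vq_compress(weight: np.ndarray, group: int = 8, k: int = 256, iters: int = 20):
    """Quantize a weight matrix into (codebook, indices) via naive k-means VQ.

    Each row is split into `group`-dim vectors; every vector is replaced by the
    index of its nearest centroid, so per-weight storage drops to roughly
    log2(k) / (group * 16) of fp16, plus the small codebook.
    """
    vecs = weight.reshape(-1, group)
    rng = np.random.default_rng(0)
    codebook = vecs[rng.choice(len(vecs), size=k, replace=False)]
    for _ in range(iters):
        dists = ((vecs[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
        idx = dists.argmin(1)
        for c in range(k):
            members = vecs[idx == c]
            if len(members):
                codebook[c] = members.mean(0)
    return codebook, idx

def vq_decompress(codebook: np.ndarray, idx: np.ndarray, shape) -> np.ndarray:
    """Reconstruct the (lossy) weight matrix from codebook lookups."""
    return codebook[idx].reshape(shape)

W = np.random.default_rng(1).normal(size=(128, 64)).astype(np.float32)
cb, idx = vq_compress(W)
W_hat = vq_decompress(cb, idx, W.shape)
print(float(np.abs(W - W_hat).mean()))  # reconstruction error of the toy VQ
```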
arXiv:2409.15528 (https://arxiv.org/abs/2409.15528) [pdf, other]
Subjects: cs.RO (Robotics); cs.LG (Machine Learning)
Title: Learning Diverse Robot Striking Motions with Diffusion Models and Kinematically Constrained Gradient Guidance
Authors: Kin Man Lee, Sean Ye, Qingyu Xiao, Zixuan Wu, Zulfiqar Zaidi, David B. D'Ambrosio, Pannag R. Sanketi, Matthew Gombolay
Abstract: Advances in robot learning have enabled robots to generate skills for a variety of tasks. Yet, robot learning is typically sample inefficient, struggles to learn from data sources exhibiting varied behaviors, and does not naturally incorporate constraints. These properties are critical for fast, agile tasks such as playing table tennis. Modern techniques for learning from demonstration improve sample efficiency and scale to diverse data, but are rarely evaluated on agile tasks. In the case of reinforcement learning, achieving good performance requires training on high-fidelity simulators. To overcome these limitations, we develop a novel diffusion modeling approach that is offline, constraint-guided, and expressive of diverse agile behaviors. The key to our approach is a kinematic constraint gradient guidance (KCGG) technique that computes gradients through both the forward kinematics of the robot arm and the diffusion model to direct the sampling process. KCGG minimizes the cost of violating constraints while simultaneously keeping the sampled trajectory in-distribution of the training data. We demonstrate the effectiveness of our approach for time-critical robotic tasks by evaluating KCGG in two challenging domains: simulated air hockey and real table tennis. In simulated air hockey, we achieved a 25.4% increase in block rate, while in table tennis, we saw a 17.3% increase in success rate compared to imitation learning baselines.
Submitted 23 September, 2024; originally announced September 2024.
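The guidance step described above can be sketched as follows: differentiate a constraint cost through a (toy) forward-kinematics function and nudge the diffusion model's denoised proposal with that gradient. Everything here is a stand-in for illustration only: the two-link arm, the hit-point cost, the `denoiser`, and the update rule are assumptions, not the paper's KCGG schedule.

```python
import torch

def fk_planar_2link(q: torch.Tensor, lengths=(0.3, 0.25)) -> torch.Tensor:
    """Toy forward kinematics: 2-link planar arm, joint angles -> end-effector xy."""
    l1, l2 = lengths
    x = l1 * torch.cos(q[..., 0]) + l2 * torch.cos(q[..., 0] + q[..., 1])
    y = l1 * torch.sin(q[..., 0]) + l2 * torch.sin(q[..., 0] + q[..., 1])
    return torch.stack([x, y], dim=-1)

def constraint_cost(ee_xy: torch.Tensor, target=torch.tensor([0.4, 0.1])) -> torch.Tensor:
    """Hypothetical task constraint: mean squared miss distance to a hit point."""
    return ((ee_xy - target) ** 2).sum(-1).mean()

def guided_denoise_step(x_t, t, denoiser, guide_scale=1.0, sigma=0.1):
    """One constraint-guided reverse-diffusion step (sketch).

    The constraint gradient is obtained by differentiating the cost through the
    forward kinematics of the sampled joint trajectory, then subtracted from the
    denoiser's proposal so samples move toward low cost while staying close to
    what the model would generate anyway.
    """
    x = x_t.detach().requires_grad_(True)
    cost = constraint_cost(fk_planar_2link(x))
    grad = torch.autograd.grad(cost, x)[0]
    x0_hat = denoiser(x_t, t)                      # model's denoised trajectory
    return x0_hat - guide_scale * grad + sigma * torch.randn_like(x_t)

# Hypothetical usage with an untrained stand-in denoiser.
denoiser = lambda x, t: x * 0.9
traj = torch.randn(16, 2)                          # 16 timesteps of 2 joint angles
traj = guided_denoise_step(traj, t=10, denoiser=denoiser)
print(traj.shape)
```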
arXiv:2409.05474 (https://arxiv.org/abs/2409.05474) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
Title: PVP-Recon: Progressive View Planning via Warping Consistency for Sparse-View Surface Reconstruction
Authors: Sheng Ye, Yuze He, Matthieu Lin, Jenny Sheng, Ruoyu Fan, Yiheng Han, Yubin Hu, Ran Yi, Yu-Hui Wen, Yong-Jin Liu, Wenping Wang
Abstract: Neural implicit representations have revolutionized dense multi-view surface reconstruction, yet their performance significantly diminishes with sparse input views. A few pioneering works have sought to tackle the challenge of sparse-view reconstruction by leveraging additional geometric priors or multi-scene generalizability. However, they are still hindered by the imperfect choice of input views, using images under empirically determined viewpoints to provide considerable overlap. We propose PVP-Recon, a novel and effective sparse-view surface reconstruction method that progressively plans the next best views to form an optimal set of sparse viewpoints for image capturing. PVP-Recon starts initial surface reconstruction with as few as 3 views and progressively adds new views, which are determined based on a novel warping score that reflects the information gain of each newly added view.
This progressive view planning process is interleaved with a neural SDF-based reconstruction module that utilizes multi-resolution hash features, enhanced by a progressive training scheme and a directional Hessian loss. Quantitative and qualitative experiments on three benchmark datasets show that our framework achieves high-quality reconstruction with a constrained input budget and outperforms existing baselines.
Submitted 9 September, 2024; originally announced September 2024.
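The progressive planning loop can be sketched generically: greedily add the candidate view with the highest information-gain score until the capture budget is exhausted or no candidate adds enough. The warping-consistency score itself is not reproduced here; `score_fn` (and the toy angular score in the usage example) is only a placeholder for it.

```python
from typing import Callable, List, Sequence

def plan_views(candidates: Sequence[str],
               initial: Sequence[str],
               score_fn: Callable[[str, List[str]], float],
               budget: int,
               min_gain: float = 0.05) -> List[str]:
    """Greedy progressive view planning (sketch).

    Starting from a few captured views, repeatedly pick the candidate whose
    score (a stand-in for a warping-based information gain) is highest,
    stopping at the budget or when no candidate is informative enough.
    """
    captured = list(initial)
    remaining = [c for c in candidates if c not in captured]
    while len(captured) < budget and remaining:
        best = max(remaining, key=lambda v: score_fn(v, captured))
        if score_fn(best, captured) < min_gain:
            break
        captured.append(best)      # capture the image and retrain the SDF here
        remaining.remove(best)
    return captured

# Hypothetical usage: a toy score that simply prefers views far from captured ones.
views = [f"view_{i*30}deg" for i in range(12)]
toy_score = lambda v, cap: min(abs(int(v.split('_')[1][:-3]) - int(c.split('_')[1][:-3]))
                               for c in cap) / 360.0
print(plan_views(views, views[:3], toy_score, budget=6))
```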
arXiv:2409.04758 (https://arxiv.org/abs/2409.04758) [pdf, other]
DOI: 10.1007/978-3-031-72111-3_23 (https://doi.org/10.1007/978-3-031-72111-3_23)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: SGSeg: Enabling Text-free Inference in Language-guided Segmentation of Chest X-rays via Self-guidance
Authors: Shuchang Ye, Mingyuan Meng, Mingjian Li, Dagan Feng, Jinman Kim
Abstract: Segmentation of infected areas in chest X-rays is pivotal for facilitating the accurate delineation of pulmonary structures and pathological anomalies. Recently, multi-modal language-guided image segmentation methods have emerged as a promising solution for chest X-rays, where the clinical text reports, depicting the assessment of the images, are used as guidance. Nevertheless, existing language-guided methods require clinical reports alongside the images, and hence they are not applicable for use in image segmentation in a decision support context, but rather limited to retrospective image analysis after clinical reporting has been completed. In this study, we propose a self-guided segmentation framework (SGSeg) that leverages language guidance for training (multi-modal) while enabling text-free inference (uni-modal), which is the first that enables text-free inference in language-guided segmentation. We exploit the critical location information of both pulmonary and pathological structures depicted in the text reports and introduce a novel localization-enhanced report generation (LERG) module to generate clinical reports for self-guidance. Our LERG integrates an object detector and a location-based attention aggregator, weakly supervised by a location-aware pseudo-label extraction module. Extensive experiments on the well-benchmarked QaTa-COV19 dataset demonstrate that our SGSeg achieved superior performance compared to existing uni-modal segmentation methods and closely matched the state-of-the-art performance of multi-modal language-guided segmentation methods.
Submitted 7 September, 2024; originally announced September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This preprint has not undergone peer review or any post-submission improvments or corrections</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14158">arXiv:2408.14158</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14158">pdf</a>, <a href="https://arxiv.org/format/2408.14158">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fire-Flyer AI-HPC: A Cost-Effective Software-Hardware Co-Design for Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=An%2C+W">Wei An</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shanhuang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+H">Honghui Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+K">Kai Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Q">Qiushi Du</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+W">Wenjun Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+K">Kang Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jianzhong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yongqiang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Z">Zhe Fu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Ying He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Panpan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiashi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+W">Wenfeng Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaodong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuxuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+S">Shanghao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+X">Xiaotao Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+T">Tian Pei</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14158v2-abstract-short" style="display: inline;"> The rapid progress in Deep Learning (DL) and Large Language Models (LLMs) has exponentially increased demands of computational power and bandwidth. This, combined with the high costs of faster computing chips and interconnects, has significantly inflated High Performance Computing (HPC) construction costs. 
To address these challenges, we introduce the Fire-Flyer AI-HPC architecture, a synergistic hardware-software co-design framework, and its best practices. For DL training, we deployed Fire-Flyer 2 with 10,000 PCIe A100 GPUs and achieved performance approximating that of the DGX-A100 while reducing costs by half and energy consumption by 40%. We specifically engineered HFReduce to accelerate allreduce communication and implemented numerous measures to keep our Computation-Storage Integrated Network congestion-free. Through our software stack, including HaiScale, 3FS, and HAI-Platform, we achieved substantial scalability by overlapping computation and communication. Our system-oriented experience from DL training provides valuable insights to drive future advancements in AI-HPC.
Submitted 31 August, 2024; v1 submitted 26 August, 2024; originally announced August 2024.
Comments: This is the preprint version of the paper accepted for presentation at the 2024 International Conference for High Performance Computing, Networking, Storage, and Analysis (SC'24). © 2024 IEEE. Personal use of this material is permitted. For other uses, permission from IEEE must be obtained.
Please refer to IEEE Xplore for the final published version.

arXiv:2408.12574 (https://arxiv.org/abs/2408.12574) [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: MuMA-ToM: Multi-modal Multi-Agent Theory of Mind
Authors: Haojun Shi, Suyu Ye, Xinyu Fang, Chuanyang Jin, Leyla Isik, Yen-Ling Kuo, Tianmin Shu
Abstract: Understanding people's social interactions in complex real-world scenarios often relies on intricate mental reasoning. To truly understand how and why people interact with one another, we must infer the underlying mental states that give rise to the social interactions, i.e., Theory of Mind reasoning in multi-agent interactions. Additionally, social interactions are often multi-modal -- we can watch people's actions, hear their conversations, and/or read about their past behaviors. For AI systems to successfully and safely interact with people in real-world environments, they also need to understand people's mental states as well as their inferences about each other's mental states based on multi-modal information about their interactions. For this, we introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates mental reasoning in embodied multi-agent interactions.
In MuMA-ToM, we provide video and text descriptions of people's multi-modal behavior in realistic household environments. Based on the context, we then ask questions about people's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM in a human experiment and provided a human baseline. We also proposed a novel multi-modal, multi-agent ToM model, LIMP (Language model-based Inverse Multi-agent Planning). Our experimental results show that LIMP significantly outperforms state-of-the-art methods, including large multi-modal models (e.g., GPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.
Submitted 23 January, 2025; v1 submitted 22 August, 2024; originally announced August 2024.
Comments: AAAI-25 (Oral). Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code: https://github.com/SCAI-JHU/MuMA-ToM

arXiv:2408.11030 (https://arxiv.org/abs/2408.11030) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: OpenScan: A Benchmark for Generalized Open-Vocabulary 3D Scene Understanding
Authors: Youjun Zhao, Jiaying Lin, Shuquan Ye, Qianshi Pang, Rynson W. H. Lau
Abstract: Open-vocabulary 3D scene understanding (OV-3D) aims to localize and classify novel objects beyond the closed object classes. However, existing approaches and benchmarks primarily focus on the open vocabulary problem within the context of object classes, which is insufficient to provide a holistic evaluation of the extent to which a model understands the 3D scene.
In this paper, we introduce a more challenging task called Generalized Open-Vocabulary 3D Scene Understanding (GOV-3D) to explore the open vocabulary problem beyond object classes. It encompasses an open and diverse set of generalized knowledge, expressed as linguistic queries of fine-grained and object-specific attributes. To this end, we contribute a new benchmark named OpenScan, which consists of 3D object attributes across eight representative linguistic aspects, including affordance, property, material, and more. We further evaluate state-of-the-art OV-3D methods on our OpenScan benchmark, and discover that these methods struggle to comprehend the abstract vocabularies of the GOV-3D task, a challenge that cannot be addressed by simply scaling up object classes during training. We highlight the limitations of existing methodologies and explore a promising direction to overcome the identified shortcomings. Data and code are available at https://github.com/YoujunZhao/OpenScan
Submitted 20 August, 2024; originally announced August 2024.
arXiv:2408.10746 (https://arxiv.org/abs/2408.10746) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); cs.NI (Networking and Internet Architecture)
Title: Pluto and Charon: A Time and Memory Efficient Collaborative Edge AI Framework for Personal LLMs Fine-Tuning
Authors: Bei Ouyang, Shengyuan Ye, Liekang Zeng, Tianyi Qian, Jingyi Li, Xu Chen
Abstract: Large language models (LLMs) have unlocked a plethora of powerful applications at the network edge, such as intelligent personal assistants. Data privacy and security concerns have prompted a shift towards edge-based fine-tuning of personal LLMs, away from cloud reliance. However, this raises issues of computational intensity and resource scarcity, hindering training efficiency and feasibility. While current studies investigate parameter-efficient fine-tuning (PEFT) techniques to mitigate resource constraints, our analysis indicates that these techniques are not sufficiently resource-efficient for edge devices. To tackle these challenges, we propose Pluto and Charon (PAC), a time and memory efficient collaborative edge AI framework for personal LLMs fine-tuning. PAC breaks the resource wall of personal LLMs fine-tuning with a sophisticated algorithm-system co-design. (1) Algorithmically, PAC implements a personal LLMs fine-tuning technique that is efficient in terms of parameters, time, and memory.
It utilizes Parallel Adapters to circumvent the need for a full backward pass through the LLM backbone. Additionally, an activation cache mechanism further streamlines the process by removing the need for repeated forward passes across multiple epochs. (2) Systematically, PAC leverages edge devices in close proximity, pooling them as a collective resource for in-situ personal LLMs fine-tuning, and utilizes hybrid data and pipeline parallelism to orchestrate distributed training. The use of the activation cache eliminates the need for a forward pass through the LLM backbone, enabling exclusive fine-tuning of the Parallel Adapters using data parallelism. Extensive evaluation based on a prototype implementation demonstrates that PAC remarkably outperforms state-of-the-art approaches, achieving up to 8.64x end-to-end speedup and up to 88.16% reduction in memory footprint.
Submitted 20 August, 2024; originally announced August 2024.
Comments: Accepted by The 53rd International Conference on Parallel Processing (ICPP'24)
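The activation-cache idea can be illustrated with a small sketch: run the frozen backbone once, cache its hidden states, and train only the adapter head on the cache in later epochs. The tiny `backbone`/`adapter` modules and the single-device loop below are assumptions for illustration; PAC itself attaches Parallel Adapters inside an LLM and distributes the work across edge devices.

```python
import torch
import torch.nn as nn

# Frozen "backbone" and a small trainable adapter head: stand-ins for the LLM
# backbone and Parallel Adapters described above (names are illustrative only).
backbone = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64)).eval()
for p in backbone.parameters():
    p.requires_grad_(False)
adapter = nn.Sequential(nn.Linear(64, 16), nn.ReLU(), nn.Linear(16, 2))

data = [torch.randn(8, 64) for _ in range(10)]
labels = [torch.randint(0, 2, (8,)) for _ in range(10)]

# One forward pass through the frozen backbone; the hidden states are cached so
# later epochs never touch the backbone again (the "activation cache" idea).
with torch.no_grad():
    cache = [backbone(x) for x in data]

opt = torch.optim.Adam(adapter.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
for epoch in range(3):                      # epochs 2+ reuse the cache for free
    for h, y in zip(cache, labels):
        loss = loss_fn(adapter(h), y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    print(f"epoch {epoch}: last batch loss {loss.item():.3f}")
```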
arXiv:2408.09130 (https://arxiv.org/abs/2408.09130) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Gaussian in the Dark: Real-Time View Synthesis From Inconsistent Dark Images Using Gaussian Splatting
Authors: Sheng Ye, Zhen-Hui Dong, Yubin Hu, Yu-Hui Wen, Yong-Jin Liu
Abstract: 3D Gaussian Splatting has recently emerged as a powerful representation that can synthesize remarkable novel views using consistent multi-view images as input. However, we notice that images captured in dark environments, where the scenes are not fully illuminated, can exhibit considerable brightness variations and multi-view inconsistency, which poses great challenges to 3D Gaussian Splatting and severely degrades its performance. To tackle this problem, we propose Gaussian-DK. Observing that inconsistencies are mainly caused by camera imaging, we represent a consistent radiance field of the physical world using a set of anisotropic 3D Gaussians, and design a camera response module to compensate for multi-view inconsistencies. We also introduce a step-based gradient scaling strategy to constrain Gaussians near the camera, which tend to turn into floaters, from splitting and cloning. Experiments on our proposed benchmark dataset demonstrate that Gaussian-DK produces high-quality renderings without ghosting and floater artifacts and significantly outperforms existing methods. Furthermore, we can also synthesize light-up images by controlling exposure levels to clearly show details in shadow areas.
Submitted 20 August, 2024; v1 submitted 17 August, 2024; originally announced August 2024.
Comments: Accepted by PG 2024
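One way to picture the camera response module is as a small learnable mapping from the consistent rendered radiance to each photo's recorded brightness; a minimal sketch follows. The per-image exposure/gamma parametrization is an assumption made for illustration, not the paper's exact module, and the Gaussian renderer is replaced here by random pixel values.

```python
import torch
import torch.nn as nn

class CameraResponse(nn.Module):
    """Per-image exposure/response compensation (sketch).

    Maps scene radiance rendered from the shared 3D representation to the
    brightness actually recorded by each dark-environment photo, so the
    underlying radiance field can remain multi-view consistent.
    """
    def __init__(self, num_images: int):
        super().__init__()
        self.log_exposure = nn.Parameter(torch.zeros(num_images))  # per-image gain
        self.log_gamma = nn.Parameter(torch.zeros(num_images))     # per-image tone curve

    def forward(self, radiance: torch.Tensor, img_idx: int) -> torch.Tensor:
        gain = self.log_exposure[img_idx].exp()
        gamma = self.log_gamma[img_idx].exp()
        return (gain * radiance).clamp(1e-6, 1.0) ** gamma

# Hypothetical usage: adjust a batch of rendered pixels to match image 3, or
# "light up" the scene at test time by overriding the learned exposure.
resp = CameraResponse(num_images=20)
rendered = torch.rand(4096, 3)             # stand-in radiance from the renderer
observed = resp(rendered, img_idx=3)       # compare this to the captured photo
print(observed.shape)
```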
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by PG 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08015">arXiv:2408.08015</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08015">pdf</a>, <a href="https://arxiv.org/format/2408.08015">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for Collaborative DNN Training on Heterogeneous Edge Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shengyuan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+L">Liekang Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+X">Xiaowen Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+G">Guoliang Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08015v1-abstract-short" style="display: inline;"> On-device Deep Neural Network (DNN) training has been recognized as crucial for privacy-preserving machine learning at the edge. However, the intensive training workload and limited onboard computing resources pose significant challenges to the availability and efficiency of model training. While existing works address these challenges through native resource management optimization, we instead le&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08015v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08015v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08015v1-abstract-full" style="display: none;"> On-device Deep Neural Network (DNN) training has been recognized as crucial for privacy-preserving machine learning at the edge. However, the intensive training workload and limited onboard computing resources pose significant challenges to the availability and efficiency of model training. While existing works address these challenges through native resource management optimization, we instead leverage our observation that edge environments usually comprise a rich set of accompanying trusted edge devices with idle resources beyond a single terminal. We propose Asteroid, a distributed edge training system that breaks the resource walls across heterogeneous edge devices for efficient model training acceleration. 
Asteroid adopts a hybrid pipeline parallelism to orchestrate distributed training, along with a judicious parallelism planning for maximizing throughput under certain resource constraints. Furthermore, a fault-tolerant yet lightweight pipeline replay mechanism is developed to tame device-level dynamics for training robustness and performance stability. We implement Asteroid on heterogeneous edge devices with both vision and language models, demonstrating up to 12.2x faster training than conventional parallelism methods and 2.1x faster training than state-of-the-art hybrid parallelism methods in our evaluations. Furthermore, Asteroid can recover the training pipeline 14x faster than baseline methods while preserving comparable throughput despite unexpected device exits and failures.
Submitted 15 August, 2024; originally announced August 2024.
Comments: Accepted by The 30th Annual International Conference on Mobile Computing and Networking (MobiCom'24)

arXiv:2407.15320 (https://arxiv.org/abs/2407.15320) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); cs.NI (Networking and Internet Architecture)
Title: Edge Graph Intelligence: Reciprocally Empowering Edge Networks with Graph Intelligence
Authors: Liekang Zeng, Shengyuan Ye, Xu Chen, Xiaoxi Zhang, Ju Ren, Jian Tang, Yang Yang, Xuemin Shen
Abstract: Recent years have witnessed a thriving growth of computing facilities connected at the network edge, cultivating edge networks as a fundamental infrastructure
for supporting miscellaneous intelligent services. Meanwhile, Artificial Intelligence (AI) frontiers have extrapolated to the graph domain and promoted Graph Intelligence (GI). Given the inherent relation between graphs and networks, the interdiscipline of graph learning and edge networks, i.e., Edge GI or EGI, has revealed a novel interplay between them -- GI aids in optimizing edge networks, while edge networks facilitate GI model deployment. Driven by this delicate closed loop, EGI is recognized as a promising solution to fully unleash the potential of edge computing power and is garnering growing attention. Nevertheless, research on EGI remains nascent, and there is a soaring demand within both the communications and AI communities for a dedicated venue to share recent advancements. To this end, this paper promotes the concept of EGI, explores its scope and core principles, and conducts a comprehensive survey concerning recent research efforts on this emerging field. Specifically, this paper introduces and discusses: 1) fundamentals of edge computing and graph learning, 2) emerging techniques centering on the closed loop between graph intelligence and edge networks, and 3) open challenges and research opportunities of future EGI. By bridging the gap across communication, networking, and graph learning areas, we believe that this survey can garner increased attention, foster meaningful discussions, and inspire further research ideas in EGI.
Submitted 7 January, 2025; v1 submitted 7 July, 2024; originally announced July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Communications Surveys &amp; Tutorials</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14933">arXiv:2407.14933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14933">pdf</a>, <a href="https://arxiv.org/format/2407.14933">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Consent in Crisis: The Rapid Decline of the AI Data Commons </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Longpre%2C+S">Shayne Longpre</a>, <a href="/search/cs?searchtype=author&amp;query=Mahari%2C+R">Robert Mahari</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+A">Ariel Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Lund%2C+C">Campbell Lund</a>, <a href="/search/cs?searchtype=author&amp;query=Oderinwale%2C+H">Hamidah Oderinwale</a>, <a href="/search/cs?searchtype=author&amp;query=Brannon%2C+W">William Brannon</a>, <a href="/search/cs?searchtype=author&amp;query=Saxena%2C+N">Nayan Saxena</a>, <a href="/search/cs?searchtype=author&amp;query=Obeng-Marnu%2C+N">Naana Obeng-Marnu</a>, <a href="/search/cs?searchtype=author&amp;query=South%2C+T">Tobin South</a>, <a href="/search/cs?searchtype=author&amp;query=Hunter%2C+C">Cole Hunter</a>, <a href="/search/cs?searchtype=author&amp;query=Klyman%2C+K">Kevin Klyman</a>, <a href="/search/cs?searchtype=author&amp;query=Klamm%2C+C">Christopher Klamm</a>, <a href="/search/cs?searchtype=author&amp;query=Schoelkopf%2C+H">Hailey Schoelkopf</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+N">Nikhil Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Cherep%2C+M">Manuel Cherep</a>, <a href="/search/cs?searchtype=author&amp;query=Anis%2C+A">Ahmad Anis</a>, <a href="/search/cs?searchtype=author&amp;query=Dinh%2C+A">An Dinh</a>, <a href="/search/cs?searchtype=author&amp;query=Chitongo%2C+C">Caroline Chitongo</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Sileo%2C+D">Damien Sileo</a>, <a href="/search/cs?searchtype=author&amp;query=Mataciunas%2C+D">Deividas Mataciunas</a>, <a href="/search/cs?searchtype=author&amp;query=Misra%2C+D">Diganta Misra</a>, <a href="/search/cs?searchtype=author&amp;query=Alghamdi%2C+E">Emad Alghamdi</a>, <a href="/search/cs?searchtype=author&amp;query=Shippole%2C+E">Enrico Shippole</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianguo Zhang</a> , et al. (24 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14933v2-abstract-short" style="display: inline;"> General-purpose artificial intelligence (AI) systems are built on massive swathes of public web data, assembled into corpora such as C4, RefinedWeb, and Dolma. 
To our knowledge, we conduct the first large-scale, longitudinal audit of the consent protocols for the web domains underlying AI training corpora. Our audit of 14,000 web domains provides an expansive view of crawlable web data and how codified data use preferences are changing over time. We observe a proliferation of AI-specific clauses to limit use, acute differences in restrictions on AI developers, as well as general inconsistencies between websites' expressed intentions in their Terms of Service and their robots.txt. We diagnose these as symptoms of ineffective web protocols, not designed to cope with the widespread re-purposing of the internet for AI. Our longitudinal analyses show that in a single year (2023-2024) there has been a rapid crescendo of data restrictions from web sources, rendering ~5%+ of all tokens in C4, or 28%+ of the most actively maintained, critical sources in C4, fully restricted from use. For Terms of Service crawling restrictions, a full 45% of C4 is now restricted. If respected or enforced, these restrictions are rapidly biasing the diversity, freshness, and scaling laws for general-purpose AI systems. We hope to illustrate the emerging crises in data consent, for both developers and creators. The foreclosure of much of the open web will impact not only commercial AI, but also non-commercial AI and academic research.
Submitted 24 July, 2024; v1 submitted 20 July, 2024; originally announced July 2024.
Comments: 41 pages (13 main), 5 figures, 9 tables
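The kind of robots.txt check such an audit involves can be illustrated with the Python standard library. This is a minimal sketch, not the authors' measurement pipeline; the crawler user-agent list and the example domain are illustrative choices, and reading the file performs a live network fetch.

```python
from urllib.robotparser import RobotFileParser

# Example AI crawler user agents to audit; not the paper's agent list.
AI_AGENTS = ["GPTBot", "CCBot", "Google-Extended", "anthropic-ai"]

def audit_domain(domain: str) -> dict:
    """Return which AI user agents may fetch the domain root according to robots.txt."""
    rp = RobotFileParser()
    rp.set_url(f"https://{domain}/robots.txt")
    rp.read()  # fetches robots.txt over the network
    return {agent: rp.can_fetch(agent, f"https://{domain}/") for agent in AI_AGENTS}

if __name__ == "__main__":
    print(audit_domain("example.com"))
```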
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">41 pages (13 main), 5 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07835">arXiv:2407.07835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07835">pdf</a>, <a href="https://arxiv.org/format/2407.07835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RoBus: A Multimodal Dataset for Controllable Road Networks and Building Layouts Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Ruihang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Huangnan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shanding Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shijian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Z">Zhijie Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07835v1-abstract-short" style="display: inline;"> Automated 3D city generation, focusing on road networks and building layouts, is in high demand for applications in urban design, multimedia games and autonomous driving simulations. The surge of generative AI facilitates designing city layouts based on deep learning models. However, the lack of high-quality datasets and benchmarks hinders the progress of these data-driven methods in generating ro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07835v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07835v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07835v1-abstract-full" style="display: none;"> Automated 3D city generation, focusing on road networks and building layouts, is in high demand for applications in urban design, multimedia games and autonomous driving simulations. The surge of generative AI facilitates designing city layouts based on deep learning models. However, the lack of high-quality datasets and benchmarks hinders the progress of these data-driven methods in generating road networks and building layouts. Furthermore, few studies consider urban characteristics, which generally take graphics as analysis objects and are crucial for practical applications, to control the generative process. To alleviate these problems, we introduce a multimodal dataset with accompanying evaluation metrics for controllable generation of Road networks and Building layouts (RoBus), which is the first and largest open-source dataset in city generation so far. RoBus dataset is formatted as images, graphics and texts, with $72,400$ paired samples that cover around $80,000km^2$ globally. 
We analyze the RoBus dataset statistically and validate its effectiveness against existing road network and building layout generation methods. Additionally, we design new baselines that incorporate urban characteristics, such as road orientation and building density, into the process of generating road networks and building layouts using the RoBus dataset, enhancing the practicality of automated urban design. The RoBus dataset and related code are published at https://github.com/tourlics/RoBus_Dataset.
Submitted 10 July, 2024; originally announced July 2024.

arXiv:2407.04029 (https://arxiv.org/abs/2407.04029) [cs.LG]
Title: Robust Learning under Hybrid Noise
Authors: Yang Wei, Shuo Chen, Shanshan Ye, Bo Han, Chen Gong
Abstract: Feature noise and label noise are ubiquitous in practical scenarios and pose great challenges for training a robust machine learning model. Most previous approaches deal with only a single problem, either feature noise or label noise. However, in real-world applications, hybrid noise, which contains both feature noise and label noise, is very common due to unreliable data collection and annotation processes. Although some results have been achieved by a few representation-learning-based attempts, this issue is still far from being addressed with promising performance and guaranteed theoretical analyses.
To address this challenge, we propose a novel unified learning framework called "Feature and Label Recovery" (FLR) to combat hybrid noise from the perspective of data recovery, where we concurrently reconstruct both the feature matrix and the label matrix of the input data. Specifically, the clean feature matrix is discovered by low-rank approximation, and the ground-truth label matrix is embedded based on the recovered features with a nuclear norm regularization. Meanwhile, the feature noise and label noise are characterized by their respective adaptive matrix norms to satisfy the corresponding maximum likelihood. As this framework leads to a non-convex optimization problem, we develop a non-convex Alternating Direction Method of Multipliers (ADMM) with a convergence guarantee to solve our learning objective. We also provide a theoretical analysis showing that the generalization error of FLR can be upper-bounded in the presence of hybrid noise. Experimental results on several typical benchmark datasets clearly demonstrate the superiority of the proposed method over state-of-the-art robust learning approaches under various types of noise.
Submitted 4 July, 2024; originally announced July 2024.
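The recovery step described above rests on a standard low-rank primitive. The sketch below shows singular value thresholding, the proximal operator of the nuclear norm that ADMM-style low-rank recovery solvers typically apply at each iteration; it is an illustrative building block on assumed toy data, not the authors' FLR solver.

```python
import numpy as np

def svt(X: np.ndarray, tau: float) -> np.ndarray:
    """Singular value thresholding: the proximal operator of tau * nuclear norm.
    Soft-thresholds the singular values of X, the usual building block of
    ADMM-style low-rank matrix recovery."""
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    s_shrunk = np.maximum(s - tau, 0.0)
    return (U * s_shrunk) @ Vt

# Toy usage on a noisy low-rank matrix (values are arbitrary).
rng = np.random.default_rng(0)
low_rank = rng.standard_normal((50, 3)) @ rng.standard_normal((3, 40))
noisy = low_rank + 0.1 * rng.standard_normal((50, 40))
denoised = svt(noisy, tau=2.0)
print(np.linalg.matrix_rank(denoised, tol=1e-6))  # prints a small rank (planted rank is 3)
```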
arXiv:2406.18021 (https://arxiv.org/abs/2406.18021) [cs.SD, cs.LG, eess.AS]
Title: SC-MoE: Switch Conformer Mixture of Experts for Unified Streaming and Non-streaming Code-Switching ASR
Authors: Shuaishuai Ye, Shunfei Chen, Xinhui Hu, Xinkang Xu
Abstract: In this work, we propose a Switch-Conformer-based MoE system named SC-MoE for unified streaming and non-streaming code-switching (CS) automatic speech recognition (ASR). We design a streaming MoE layer consisting of three language experts, corresponding to Mandarin, English, and blank, respectively, equipped with a language identification (LID) network trained with a Connectionist Temporal Classification (CTC) loss as a router in the encoder of SC-MoE, to achieve a real-time streaming CS ASR system. To further utilize the language information embedded in text, we also incorporate MoE layers into the decoder of SC-MoE. In addition, we introduce routers into every MoE layer of the encoder and the decoder and achieve better recognition performance. Experimental results show that SC-MoE significantly improves CS ASR performance over the baseline with comparable computational efficiency.
Submitted 25 June, 2024; originally announced June 2024.
Comments: Accepted by InterSpeech 2024; 5 pages, 2 figures
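As a toy illustration of the routing idea (not the SC-MoE architecture, which uses a CTC-trained LID router inside a Conformer), the sketch below mixes the outputs of three per-language "experts" with a per-frame softmax gate; all shapes and the linear experts are assumptions made for demonstration.

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def route_frames(frames, router_w, experts):
    """frames: (T, D) acoustic features; router_w: (D, n_experts) router weights;
    experts: callables mapping (T, D) -> (T, D).
    Returns the router-weighted mixture of expert outputs per frame."""
    gate = softmax(frames @ router_w)              # (T, n_experts) soft language posteriors
    outs = np.stack([e(frames) for e in experts])  # (n_experts, T, D)
    return np.einsum("te,etd->td", gate, outs)     # per-frame weighted sum

# Toy usage: three linear "experts" (e.g., Mandarin / English / blank).
rng = np.random.default_rng(1)
T, D, E = 8, 16, 3
experts = [lambda x, W=rng.standard_normal((D, D)) / np.sqrt(D): x @ W for _ in range(E)]
y = route_frames(rng.standard_normal((T, D)), rng.standard_normal((D, E)), experts)
print(y.shape)  # (8, 16)
```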
arXiv:2406.11813 (https://arxiv.org/abs/2406.11813) [cs.CL]
Title: How Do Large Language Models Acquire Factual Knowledge During Pretraining?
Authors: Hoyeon Chang, Jinho Park, Seonghyeon Ye, Sohee Yang, Youngkyung Seo, Du-Seong Chang, Minjoon Seo
Abstract: Despite the recent observation that large language models (LLMs) can store substantial factual knowledge, there is a limited understanding of the mechanisms of how they acquire factual knowledge through pretraining. This work addresses this gap by studying how LLMs acquire factual knowledge during pretraining. The findings reveal several important insights into the dynamics of factual knowledge acquisition during pretraining. First, counterintuitively, we observe that pretraining on more data shows no significant improvement in the model's capability to acquire and maintain factual knowledge. Next, there is a power-law relationship between training steps and forgetting of memorization and generalization of factual knowledge, and LLMs trained with duplicated training data exhibit faster forgetting. Third, training LLMs with larger batch sizes can enhance the models' robustness to forgetting. Overall, our observations suggest that factual knowledge acquisition in LLM pretraining occurs by progressively increasing the probability of factual knowledge presented in the pretraining data at each step. However, this increase is diluted by subsequent forgetting. Based on this interpretation, we demonstrate that we can provide plausible explanations for recently observed behaviors of LLMs, such as the poor performance of LLMs on long-tail knowledge and the benefits of deduplicating the pretraining corpus.
Submitted 12 November, 2024; v1 submitted 17 June, 2024; originally announced June 2024.
Comments: Accepted at NeurIPS 2024
ACM Class: I.2.7
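The reported power-law relationship between training steps and forgetting can be checked with a log-log linear fit. The sketch below does this on synthetic retention numbers invented purely for illustration; it is not the paper's data or analysis code.

```python
import numpy as np

# Synthetic retention measurements (illustrative only, not the paper's data):
# retention(t) ~ a * t^(-b) for t steps after a fact was last seen.
steps = np.array([1, 2, 4, 8, 16, 32, 64, 128], dtype=float)
retention = 0.9 * steps ** -0.3 * np.exp(np.random.default_rng(0).normal(0, 0.02, steps.size))

# A power law is a straight line in log-log space: log r = log a - b * log t.
slope, intercept = np.polyfit(np.log(steps), np.log(retention), deg=1)
print(f"fitted exponent b = {-slope:.3f}, fitted scale a = {np.exp(intercept):.3f}")
```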
arXiv:2406.08418 (https://arxiv.org/abs/2406.08418) [cs.CV, cs.AI]
Title: OmniCorpus: A Unified Multimodal Corpus of 10 Billion-Level Images Interleaved with Text
Authors: Qingyun Li, Zhe Chen, Weiyun Wang, Wenhai Wang, Shenglong Ye, Zhenjiang Jin, Guanzhou Chen, Yinan He, Zhangwei Gao, Erfei Cui, Jiashuo Yu, Hao Tian, Jiasheng Zhou, Chao Xu, Bin Wang, Xingjian Wei, Wei Li, Wenjian Zhang, Bo Zhang, Pinlong Cai, Licheng Wen, Xiangchao Yan, Zhenxiang Li, Pei Chu, Yi Wang, et al. (15 additional authors not shown)
Abstract: Image-text interleaved data, consisting of multiple images and texts arranged in a natural document format, aligns with the presentation paradigm of internet data and closely resembles human reading habits.
Recent studies have shown that such data aids multimodal in-context learning and maintains the capabilities of large language models during multimodal fine-tuning. However, the limited scale and diversity of current image-text interleaved data restrict the development of multimodal large language models. In this paper, we introduce OmniCorpus, a 10 billion-scale image-text interleaved dataset. Using an efficient data engine, we filter and extract large-scale, high-quality documents, which contain 8.6 billion images and 1,696 billion text tokens. Compared to counterparts (e.g., MMC4, OBELICS), our dataset 1) has a 15 times larger scale while maintaining good data quality; 2) features more diverse sources, including both English and non-English websites as well as video-centric websites; and 3) is more flexible, easily degradable from an image-text interleaved format to a pure text corpus or image-text pairs. Through comprehensive analysis and experiments, we validate the quality, usability, and effectiveness of the proposed dataset. We hope this can provide a solid data foundation for future multimodal model research. Code and data are released at https://github.com/OpenGVLab/OmniCorpus.
Submitted 12 July, 2024; v1 submitted 12 June, 2024; originally announced June 2024.
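To illustrate what "degrading" an interleaved document means in practice, the sketch below converts a toy interleaved record into pure text or image-text pairs. The record schema, URL, and pairing heuristic are assumptions for demonstration, not the OmniCorpus format or tooling.

```python
# A hypothetical interleaved document: an ordered list of text and image items.
doc = [
    {"type": "text", "text": "A red panda rests on a branch."},
    {"type": "image", "url": "https://example.com/red_panda.jpg"},
    {"type": "text", "text": "Red pandas are native to the eastern Himalayas."},
]

def to_pure_text(doc):
    """Drop images, keeping only the text stream."""
    return " ".join(item["text"] for item in doc if item["type"] == "text")

def to_image_text_pairs(doc):
    """Pair each image with its nearest preceding text block (one simple heuristic)."""
    pairs, last_text = [], None
    for item in doc:
        if item["type"] == "text":
            last_text = item["text"]
        elif item["type"] == "image" and last_text is not None:
            pairs.append((item["url"], last_text))
    return pairs

print(to_pure_text(doc))
print(to_image_text_pairs(doc))
```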
arXiv:2406.05761 (https://arxiv.org/abs/2406.05761) [cs.CL]
Title: The BiGGen Bench: A Principled Benchmark for Fine-grained Evaluation of Language Models with Language Models
Authors: Seungone Kim, Juyoung Suk, Ji Yong Cho, Shayne Longpre, Chaeeun Kim, Dongkeun Yoon, Guijin Son, Yejin Cho, Sheikh Shafayat, Jinheon Baek, Sue Hyun Park, Hyeonbin Hwang, Jinkyung Jo, Hyowon Cho, Haebin Shin, Seongyun Lee, Hanseok Oh, Noah Lee, Namgyu Ho, Se June Joo, Miyoung Ko, Yoonjoo Lee, Hyungjoo Chae, Jamin Shin, Joel Jang, et al. (7 additional authors not shown)
Abstract: As language models (LMs) become capable of handling a wide range of tasks, their evaluation is becoming as challenging as their development. Most generation benchmarks currently assess LMs using abstract evaluation criteria like helpfulness and harmlessness, which often lack the flexibility and granularity of human assessment.
Additionally, these benchmarks tend to focus disproportionately on specific capabilities such as instruction following, leading to coverage bias. To overcome these limitations, we introduce the BiGGen Bench, a principled generation benchmark designed to thoroughly evaluate nine distinct capabilities of LMs across 77 diverse tasks. A key feature of the BiGGen Bench is its use of instance-specific evaluation criteria, closely mirroring the nuanced discernment of human evaluation. We apply this benchmark to assess 103 frontier LMs using five evaluator LMs. Our code, data, and evaluation results are all publicly available at https://github.com/prometheus-eval/prometheus-eval/tree/main/BiGGen-Bench.
Submitted 9 June, 2024; originally announced June 2024.
Comments: Work in Progress
arXiv:2405.17245 (https://arxiv.org/abs/2405.17245) [cs.DC, cs.AI, cs.LG, cs.NI]
Title: Galaxy: A Resource-Efficient Collaborative Edge AI System for In-situ Transformer Inference
Authors: Shengyuan Ye, Jiangsu Du, Liekang Zeng, Wenzhong Ou, Xiaowen Chu, Yutong Lu, Xu Chen
Abstract: Transformer-based models have unlocked a plethora of powerful intelligent applications at the edge, such as voice assistants in smart homes. Traditional deployment approaches offload the inference workloads to the remote cloud server, which induces substantial pressure on the backbone network and raises users' privacy concerns. To address this, in-situ inference has recently been recognized for edge intelligence, but it still confronts significant challenges stemming from the conflict between intensive workloads and limited on-device computing resources.
In this paper, we leverage the observation that many edge environments comprise a rich set of accompanying trusted edge devices with idle resources, and we propose Galaxy, a collaborative edge AI system that breaks the resource walls across heterogeneous edge devices for efficient Transformer inference acceleration. Galaxy introduces a novel hybrid model parallelism to orchestrate collaborative inference, along with heterogeneity-aware parallelism planning to fully exploit the resource potential. Furthermore, Galaxy devises tile-based fine-grained overlapping of communication and computation to mitigate the impact of tensor synchronizations on inference latency under bandwidth-constrained edge environments. Extensive evaluation based on a prototype implementation demonstrates that Galaxy remarkably outperforms state-of-the-art approaches under various edge environment setups, achieving up to a 2.5x end-to-end latency reduction.
Submitted 27 May, 2024; originally announced May 2024.
Comments: Accepted by IEEE International Conference on Computer Communications 2024
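The benefit of tile-based communication/computation overlap can be seen with a simple two-stage pipeline latency model. The per-tile times below are invented for illustration and are not measurements from Galaxy.

```python
# Back-of-the-envelope latency model for tile-based overlap (illustrative numbers).
compute_ms_per_tile = 4.0
comm_ms_per_tile = 3.0
num_tiles = 8

# No overlap: compute every tile, then synchronize every tile.
serial = num_tiles * (compute_ms_per_tile + comm_ms_per_tile)

# Perfect pipelining: communication of tile i overlaps with computation of
# tile i+1, so after the pipeline fills, the slower stage dominates.
overlapped = compute_ms_per_tile + comm_ms_per_tile \
    + (num_tiles - 1) * max(compute_ms_per_tile, comm_ms_per_tile)

print(f"serial: {serial:.1f} ms, overlapped: {overlapped:.1f} ms, "
      f"speedup: {serial / overlapped:.2f}x")
```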
arXiv:2405.05498 (https://arxiv.org/abs/2405.05498) [cs.SD, eess.AS]
Title: The RoyalFlush Automatic Speech Diarization and Recognition System for In-Car Multi-Channel Automatic Speech Recognition Challenge
Authors: Jingguang Tian, Shuaishuai Ye, Shunfei Chen, Yang Xiang, Zhaohui Yin, Xinhui Hu, Xinkang Xu
Abstract: This paper presents our system submission for the In-Car Multi-Channel Automatic Speech Recognition (ICMC-ASR) Challenge, which focuses on speaker diarization and speech recognition in complex multi-speaker scenarios. To address these challenges, we develop end-to-end speaker diarization models that notably decrease the diarization error rate (DER) by 49.58% compared to the official baseline on the development set. For speech recognition, we utilize self-supervised learning representations to train end-to-end ASR models. By integrating these models, we achieve a character error rate (CER) of 16.93% on the track 1 evaluation set, and a concatenated minimum permutation character error rate (cpCER) of 25.88% on the track 2 evaluation set.
Submitted 8 May, 2024; originally announced May 2024.
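Character error rate (CER), one metric reported above, is the edit distance between hypothesis and reference characters normalized by reference length. The sketch below computes it with a standard dynamic-programming Levenshtein distance; it is an illustration, not the challenge's official scoring tool.

```python
def edit_distance(ref: str, hyp: str) -> int:
    """Levenshtein distance between two character sequences (dynamic programming)."""
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        curr = [i]
        for j, h in enumerate(hyp, start=1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (r != h)))  # substitution (or match)
        prev = curr
    return prev[-1]

def cer(ref: str, hyp: str) -> float:
    """Character error rate: edit distance normalized by reference length."""
    return edit_distance(ref, hyp) / max(len(ref), 1)

print(f"{cer('turn on the radio', 'turn of the radio'):.3f}")  # 0.059: one substitution over 17 chars
```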
<a href="/search/cs?searchtype=author&amp;query=Li%2C+E">Erhang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+F">Fangyun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+F">Fuli Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+G">Guangbo Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guowei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">H. Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hanwei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+H">Honghui Ding</a> , et al. (132 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04434v5-abstract-short" style="display: inline;"> We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. MLA guarantees efficient inference&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04434v5-abstract-full').style.display = 'inline'; document.getElementById('2405.04434v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04434v5-abstract-full" style="display: none;"> We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. MLA guarantees efficient inference through significantly compressing the Key-Value (KV) cache into a latent vector, while DeepSeekMoE enables training strong models at an economical cost through sparse computation. Compared with DeepSeek 67B, DeepSeek-V2 achieves significantly stronger performance, and meanwhile saves 42.5% of training costs, reduces the KV cache by 93.3%, and boosts the maximum generation throughput to 5.76 times. We pretrain DeepSeek-V2 on a high-quality and multi-source corpus consisting of 8.1T tokens, and further perform Supervised Fine-Tuning (SFT) and Reinforcement Learning (RL) to fully unlock its potential. Evaluation results show that, even with only 21B activated parameters, DeepSeek-V2 and its chat versions still achieve top-tier performance among open-source models. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04434v5-abstract-full').style.display = 'none'; document.getElementById('2405.04434v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.03567">arXiv:2405.03567</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.03567">pdf</a>, <a href="https://arxiv.org/format/2405.03567">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Deep Space Separable Distillation for Lightweight Acoustic Scene Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">ShuQi Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuan Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.03567v1-abstract-short" style="display: inline;"> Acoustic scene classification (ASC) is highly important in the real world. Recently, deep learning-based methods have been widely employed for acoustic scene classification. However, these methods are currently not lightweight enough as well as their performance is not satisfactory. To solve these problems, we propose a deep space separable distillation network. Firstly, the network performs high-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03567v1-abstract-full').style.display = 'inline'; document.getElementById('2405.03567v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.03567v1-abstract-full" style="display: none;"> Acoustic scene classification (ASC) is highly important in the real world. Recently, deep learning-based methods have been widely employed for acoustic scene classification. However, these methods are currently not lightweight enough as well as their performance is not satisfactory. To solve these problems, we propose a deep space separable distillation network. Firstly, the network performs high-low frequency decomposition on the log-mel spectrogram, significantly reducing computational complexity while maintaining model performance. Secondly, we specially design three lightweight operators for ASC, including Separable Convolution (SC), Orthonormal Separable Convolution (OSC), and Separable Partial Convolution (SPC). These operators exhibit highly efficient feature extraction capabilities in acoustic scene classification tasks. 
The experimental results demonstrate that the proposed method achieves a performance gain of 9.8% over currently popular deep learning methods, while also having a smaller parameter count and lower computational complexity.
Submitted 6 May, 2024; originally announced May 2024.
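The high-low frequency decomposition of a log-mel spectrogram can be pictured as a split along the mel axis. The sketch below shows one such split; the 128-bin input and the 64-bin cutoff are arbitrary choices for illustration, not the paper's configuration.

```python
import numpy as np

def split_bands(log_mel: np.ndarray, cutoff_bin: int = 64):
    """Split a log-mel spectrogram of shape (n_mels, n_frames) into a
    low-frequency band and a high-frequency band along the mel axis."""
    low = log_mel[:cutoff_bin, :]
    high = log_mel[cutoff_bin:, :]
    return low, high

spec = np.random.default_rng(0).standard_normal((128, 431))  # stand-in for a ~10 s clip
low, high = split_bands(spec)
print(low.shape, high.shape)  # (64, 431) (64, 431)
```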
arXiv:2404.17766 (https://arxiv.org/abs/2404.17766) [cs.LG, cs.AI, cs.DC, cs.NI]
Title: Implementation of Big AI Models for Wireless Networks with Collaborative Edge Computing
Authors: Liekang Zeng, Shengyuan Ye, Xu Chen, Yang Yang
Abstract: Big Artificial Intelligence (AI) models have emerged as a crucial element in various intelligent applications at the edge, such as voice assistants in smart homes and autonomous robotics in smart factories. Training big AI models, e.g., for personalized fine-tuning and continual model refinement, poses significant challenges to edge devices due to the inherent conflict between limited computing resources and the intensive workload associated with training. Despite the constraints of on-device training, traditional approaches usually resort to aggregating training data and sending it to a remote cloud for centralized training. Nevertheless, this approach is neither sustainable, as it strains long-range backhaul transmission and energy-consuming datacenters, nor private, as it shares users' raw data with remote infrastructure. To address these challenges, we observe that prevalent edge environments usually contain a diverse collection of trusted edge devices with untapped idle resources, which can be leveraged for edge training acceleration. Motivated by this, in this article we propose collaborative edge training, a novel training mechanism that orchestrates a group of trusted edge devices as a resource pool for expedited, sustainable big AI model training at the edge. As an initial step, we present a comprehensive framework for building collaborative edge training systems and analyze in depth its merits and sustainable scheduling choices following its workflow. To further investigate the impact of its parallelism design, we empirically study a case of four typical parallelisms from the perspective of energy demand with realistic testbeds. Finally, we discuss open challenges for sustainable collaborative edge training to point to future directions of edge-centric big AI model training.
Submitted 26 April, 2024; originally announced April 2024.
arXiv:2404.16821 (https://arxiv.org/abs/2404.16821) [cs.CV]
Title: How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites
Authors: Zhe Chen, Weiyun Wang, Hao Tian, Shenglong Ye, Zhangwei Gao, Erfei Cui, Wenwen Tong, Kongzhi Hu, Jiapeng Luo, Zheng Ma, Ji Ma, Jiaqi Wang, Xiaoyi Dong, Hang Yan, Hewei Guo, Conghui He, Botian Shi, Zhenjiang Jin, Chao Xu, Bin Wang, Xingjian Wei, Wei Li, Wenjian Zhang, Bo Zhang, Pinlong Cai, et al. (10 additional authors not shown)
Abstract: In this report, we introduce InternVL 1.5, an open-source multimodal large language model (MLLM) that bridges the capability gap between open-source and proprietary commercial models in multimodal understanding. We introduce three simple improvements: (1) Strong Vision Encoder: we explored a continuous learning strategy for the large-scale vision foundation model InternViT-6B, boosting its visual understanding capabilities and making it transferable and reusable across different LLMs.
(2) Dynamic High-Resolution: we divide images into tiles ranging from 1 to 40 of 448×448 pixels according to the aspect ratio and resolution of the input images, which supports input up to 4K resolution. (3) High-Quality Bilingual Dataset: we carefully collected a high-quality bilingual dataset covering common scenes and document images, annotated with English and Chinese question-answer pairs, significantly enhancing performance in OCR- and Chinese-related tasks. We evaluate InternVL 1.5 through a series of benchmarks and comparative studies. Compared to both open-source and proprietary models, InternVL 1.5 shows competitive performance, achieving state-of-the-art results in 8 of 18 benchmarks. Code has been released at https://github.com/OpenGVLab/InternVL.
Submitted 29 April, 2024; v1 submitted 25 April, 2024; originally announced April 2024.
Comments: Technical report
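A dynamic high-resolution scheme needs to map an input image to a tile grid under a tile budget. The sketch below picks the grid (at most 40 tiles of 448×448) whose aspect ratio best matches the image, as one plausible heuristic; the exact selection rule used by InternVL 1.5 may differ.

```python
def choose_tile_grid(width: int, height: int, tile: int = 448, max_tiles: int = 40):
    """Pick a (cols, rows) grid with cols*rows <= max_tiles whose aspect ratio
    is closest to the input image's; ties prefer more tiles (finer detail)."""
    target = width / height
    best, best_err = (1, 1), float("inf")
    for rows in range(1, max_tiles + 1):
        for cols in range(1, max_tiles // rows + 1):
            err = abs(cols / rows - target)
            if err < best_err or (err == best_err and cols * rows > best[0] * best[1]):
                best, best_err = (cols, rows), err
    return best  # resize to (cols*tile, rows*tile), then cut into 448x448 tiles

print(choose_tile_grid(1920, 1080))  # (7, 4): a wide grid for a 16:9 image
```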
arXiv:2404.16418 (https://arxiv.org/abs/2404.16418) [cs.CL]
Title: Instruction Matters: A Simple yet Effective Task Selection for Optimized Instruction Tuning of Specific Tasks
Authors: Changho Lee, Janghoon Han, Seonghyeon Ye, Stanley Jungkyu Choi, Honglak Lee, Kyunghoon Bae
Abstract: Instruction tuning has been proven effective in enhancing zero-shot generalization across various tasks and in improving the performance of specific tasks. For task-specific improvements, strategically selecting and training on related tasks that provide meaningful supervision is crucial, as this approach enhances efficiency and prevents performance degradation from learning irrelevant tasks. In this light, we introduce a simple yet effective task selection method that leverages instruction information alone to identify relevant tasks, optimizing instruction tuning for specific tasks. Our method is significantly more efficient than traditional approaches, which require complex measurements of pairwise transferability between tasks or the creation of data samples for the target task. Additionally, by aligning the model with the unique instructional template style of the meta-dataset, we enhance its ability to granularly discern relevant tasks, leading to improved overall performance. Experimental results demonstrate that training on a small set of tasks, chosen solely based on the instructions, results in substantial improvements in performance on benchmarks such as P3, Big-Bench, NIV2, and Big-Bench Hard. Significantly, these improvements surpass those achieved by prior task selection methods, highlighting the superiority of our approach.
Submitted 16 October, 2024; v1 submitted 25 April, 2024; originally announced April 2024.
Comments: EMNLP 2024 (Camera-ready), by Janghoon Han and Changho Lee, with equal contribution
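One way to act on "instruction information alone" is to rank candidate tasks by how similar their instructions are to the target task's instruction. The sketch below uses TF-IDF cosine similarity from scikit-learn as a stand-in scorer; the task names, instructions, and scoring choice are assumptions for illustration, not the paper's actual selection procedure.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Made-up target and candidate task instructions.
target_instruction = "Answer the multiple-choice science question."
candidate_tasks = {
    "task_a": "Choose the correct option for the given science question.",
    "task_b": "Translate the sentence from French to English.",
    "task_c": "Select the best answer among the provided choices.",
}

texts = [target_instruction] + list(candidate_tasks.values())
tfidf = TfidfVectorizer().fit_transform(texts)
scores = cosine_similarity(tfidf[0], tfidf[1:]).ravel()

ranked = sorted(zip(candidate_tasks, scores), key=lambda kv: kv[1], reverse=True)
print(ranked)  # train on the top-k most instruction-similar tasks
```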
arXiv:2404.10346 (https://arxiv.org/abs/2404.10346) [cs.CL]
Title: Self-Explore: Enhancing Mathematical Reasoning in Language Models with Fine-grained Rewards
Authors: Hyeonbin Hwang, Doyoung Kim, Seungone Kim, Seonghyeon Ye, Minjoon Seo
Abstract: Training on large amounts of rationales (i.e., CoT fine-tuning) is effective at improving the reasoning capabilities of large language models (LLMs). However, acquiring human-authored rationales or augmenting rationales from proprietary models is costly and not scalable. In this paper, we study the problem of whether LLMs could self-improve their reasoning capabilities. To this end, we propose Self-Explore, where the LLM is tasked to explore the first wrong step (i.e., the first pit) within the rationale and use such signals as fine-grained rewards for further improvement. On the GSM8K and MATH test sets, Self-Explore achieves 11.57% and 2.89% improvement on average across three LLMs compared to supervised fine-tuning (SFT). Our code is available at https://github.com/hbin0701/Self-Explore.
arXiv:2403.12982 (https://arxiv.org/abs/2403.12982) [pdf] cond-mat.mtrl-sci (Materials Science), cs.LG (Machine Learning), physics.chem-ph (Chemical Physics)
Knowledge-Reuse Transfer Learning Methods in Molecular and Material Science
Authors: An Chen, Zhilong Wang, Karl Luigi Loza Vidaurre, Yanqiang Han, Simin Ye, Kehao Tao, Shiwei Wang, Jing Gao, Jinjin Li
Abstract: Molecules and materials are the foundation for the development of modern advanced industries such as energy storage systems and semiconductor devices. However, traditional trial-and-error methods or theoretical calculations are highly resource-intensive, and their extremely long R&D (research and development) cycles cannot meet the urgent need for new molecules and materials in industrial development. Machine learning (ML) methods based on big data are expected to break this dilemma. However, the difficulty of constructing large-scale datasets of new molecules/materials, owing to the high cost of data acquisition and annotation, limits the development of machine learning. Transfer learning lowers the data requirements for model training, which makes it stand out in research addressing data-quality issues. In this review, we summarize recent advances in transfer learning related to molecular and materials science. We focus on the application of transfer learning methods for the discovery of advanced molecules/materials, in particular the construction of transfer learning frameworks for different systems and how transfer learning can enhance model performance. In addition, the challenges of transfer learning are discussed.
Submitted 2 March, 2024; originally announced March 2024.
Comments: 42 pages, 10 figures
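The review's central point, that reusing knowledge from a data-rich source task reduces the data needed for a scarce target task, can be pictured with a minimal PyTorch sketch: freeze a backbone standing in for a model pretrained on a large source dataset and fine-tune only a small head on the target data. Everything here (layer sizes, optimizer, the synthetic batch) is invented for illustration and is not a framework from the paper.

```python
# Minimal knowledge-reuse sketch: frozen pretrained feature extractor + trainable head.
import torch
import torch.nn as nn

pretrained_backbone = nn.Sequential(      # stands in for a model trained on the source domain
    nn.Linear(128, 256), nn.ReLU(),
    nn.Linear(256, 64), nn.ReLU(),
)
for p in pretrained_backbone.parameters():
    p.requires_grad = False               # reuse source knowledge as-is

head = nn.Linear(64, 1)                   # new head for the target property
model = nn.Sequential(pretrained_backbone, head)
optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# tiny synthetic "target domain" batch, just to make the example runnable
x, y = torch.randn(32, 128), torch.randn(32, 1)
for _ in range(5):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    optimizer.step()
```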
arXiv:2403.10809 (https://arxiv.org/abs/2403.10809) [pdf, other] cs.RO (Robotics)
Efficient Trajectory Forecasting and Generation with Conditional Flow Matching
Authors: Sean Ye, Matthew Gombolay
Abstract: Trajectory prediction and generation are crucial for autonomous robots in dynamic environments. While prior research has typically focused on either prediction or generation, our approach unifies these tasks to provide a versatile framework and achieve state-of-the-art performance. While diffusion models excel at trajectory generation, their iterative sampling process is computationally intensive and hinders robotic systems' dynamic capabilities. We introduce Trajectory Conditional Flow Matching (T-CFM), a novel approach that uses flow matching techniques to learn a time-varying vector field that serves as a solver for efficient, fast trajectory generation. T-CFM demonstrates effectiveness in adversarial tracking, real-world aircraft trajectory forecasting, and long-horizon planning, outperforming state-of-the-art baselines with 35% higher predictive accuracy and 142% better planning performance. Crucially, T-CFM achieves up to a 100$\times$ speed-up compared to diffusion models without sacrificing accuracy, enabling real-time decision making in robotics. Codebase: https://github.com/CORE-Robotics-Lab/TCFM
Submitted 6 November, 2024; v1 submitted 16 March, 2024; originally announced March 2024.
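T-CFM builds on conditional flow matching. The snippet below shows only the generic (rectified-flow style) flow matching objective to make that idea concrete; it is not the paper's architecture and omits its trajectory conditioning. VectorField, the layer sizes, and the toy 2-D data are placeholders; see https://github.com/CORE-Robotics-Lab/TCFM for the actual codebase.

```python
# Generic conditional flow matching objective, for illustration only.
import torch
import torch.nn as nn

class VectorField(nn.Module):
    """v_theta(x_t, t): predicts the velocity that transports noise toward data."""
    def __init__(self, dim: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, 128), nn.SiLU(), nn.Linear(128, dim))

    def forward(self, x_t: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        return self.net(torch.cat([x_t, t], dim=-1))

def cfm_loss(model: VectorField, x1: torch.Tensor) -> torch.Tensor:
    """One training step of the flow matching loss on a batch of target samples x1."""
    x0 = torch.randn_like(x1)              # noise sample
    t = torch.rand(x1.shape[0], 1)         # random time in [0, 1]
    x_t = (1 - t) * x0 + t * x1            # point on the straight path from noise to data
    target_velocity = x1 - x0              # constant velocity along that path
    return ((model(x_t, t) - target_velocity) ** 2).mean()

# At sampling time a trajectory is generated with only a few Euler steps of dx/dt = v_theta,
# which is where the reported speed-up over iterative diffusion sampling comes from.
model = VectorField(dim=2)
print(cfm_loss(model, torch.randn(64, 2)))
```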
