Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 380 results for author: <span class="mathjax">Xiong, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xiong%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xiong, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xiong%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xiong, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xiong%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xiong%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14347">arXiv:2411.14347</a> <span> [<a href="https://arxiv.org/pdf/2411.14347">pdf</a>, <a href="https://arxiv.org/format/2411.14347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DINO-X: A Unified Vision Model for Open-World Object Detection and Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+T">Tianhe Ren</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yihao Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Q">Qing Jiang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhaoyang Zeng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuda Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenlong Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhengyu Ma</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Junyi Shen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaoke Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhuheng Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hongjie Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Han Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilong Liu</a>, <a 
href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Feng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kent Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14347v1-abstract-short" style="display: inline;"> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extend… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14347v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14347v1-abstract-full" style="display: none;"> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extends its input options to support text prompt, visual prompt, and customized prompt. With such flexible prompt options, we develop a universal object prompt to support prompt-free open-world detection, making it possible to detect anything in an image without requiring users to provide any prompt. To enhance the model's core grounding capability, we have constructed a large-scale dataset with over 100 million high-quality grounding samples, referred to as Grounding-100M, for advancing the model's open-vocabulary detection performance. Pre-training on such a large-scale grounding dataset leads to a foundational object-level representation, which enables DINO-X to integrate multiple perception heads to simultaneously support multiple object perception and understanding tasks, including detection, segmentation, pose estimation, object captioning, object-based QA, etc. Experimental results demonstrate the superior performance of DINO-X. Specifically, the DINO-X Pro model achieves 56.0 AP, 59.8 AP, and 52.4 AP on the COCO, LVIS-minival, and LVIS-val zero-shot object detection benchmarks, respectively. Notably, it scores 63.3 AP and 56.5 AP on the rare classes of LVIS-minival and LVIS-val benchmarks, both improving the previous SOTA performance by 5.8 AP. Such a result underscores its significantly improved capacity for recognizing long-tailed objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14347v1-abstract-full').style.display = 'none'; document.getElementById('2411.14347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14199">arXiv:2411.14199</a> <span> [<a href="https://arxiv.org/pdf/2411.14199">pdf</a>, <a href="https://arxiv.org/format/2411.14199">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OpenScholar: Synthesizing Scientific Literature with Retrieval-augmented LMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jacqueline He</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+R">Rulin Shao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+A">Amanpreet Singh</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+J+C">Joseph Chee Chang</a>, <a href="/search/cs?searchtype=author&query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&query=Feldman%2C+S">Sergey Feldman</a>, <a href="/search/cs?searchtype=author&query=D%27arcy%2C+M">Mike D'arcy</a>, <a href="/search/cs?searchtype=author&query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&query=Latzke%2C+M">Matt Latzke</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+M">Minyang Tian</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+P">Pan Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shengyan Liu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hao Tong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bohao Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yanyu Xiong</a>, <a href="/search/cs?searchtype=author&query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&query=Weld%2C+D">Dan Weld</a>, <a href="/search/cs?searchtype=author&query=Downey%2C+D">Doug Downey</a>, <a href="/search/cs?searchtype=author&query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14199v1-abstract-short" style="display: inline;"> Scientific progress depends on researchers' ability to synthesize the growing body of literature. Can large language models (LMs) assist scientists in this task? 
We introduce OpenScholar, a specialized retrieval-augmented LM that answers scientific queries by identifying relevant passages from 45 million open-access papers and synthesizing citation-backed responses. To evaluate OpenScholar, we dev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14199v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14199v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14199v1-abstract-full" style="display: none;"> Scientific progress depends on researchers' ability to synthesize the growing body of literature. Can large language models (LMs) assist scientists in this task? We introduce OpenScholar, a specialized retrieval-augmented LM that answers scientific queries by identifying relevant passages from 45 million open-access papers and synthesizing citation-backed responses. To evaluate OpenScholar, we develop ScholarQABench, the first large-scale multi-domain benchmark for literature search, comprising 2,967 expert-written queries and 208 long-form answers across computer science, physics, neuroscience, and biomedicine. On ScholarQABench, OpenScholar-8B outperforms GPT-4o by 5% and PaperQA2 by 7% in correctness, despite being a smaller, open model. While GPT4o hallucinates citations 78 to 90% of the time, OpenScholar achieves citation accuracy on par with human experts. OpenScholar's datastore, retriever, and self-feedback inference loop also improves off-the-shelf LMs: for instance, OpenScholar-GPT4o improves GPT-4o's correctness by 12%. In human evaluations, experts preferred OpenScholar-8B and OpenScholar-GPT4o responses over expert-written ones 51% and 70% of the time, respectively, compared to GPT4o's 32%. We open-source all of our code, models, datastore, data and a public demo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14199v1-abstract-full').style.display = 'none'; document.getElementById('2411.14199v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11646">arXiv:2411.11646</a> <span> [<a href="https://arxiv.org/pdf/2411.11646">pdf</a>, <a href="https://arxiv.org/format/2411.11646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Can Highlighting Help GitHub Maintainers Track Security Fixes? 
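The pipeline described above (retrieve passages, draft a citation-backed answer, then iterate with self-feedback) can be summarized in a short sketch. The retrieve, generate, and critique callables below are placeholders for components the abstract only names; this is not OpenScholar's actual interface.

```python
# Hypothetical sketch of a retrieval-augmented answer loop with self-feedback,
# loosely following the pattern described in the OpenScholar abstract.
# `retrieve`, `generate`, and `critique` are placeholder callables, not a real API.
from typing import Callable, List

def answer_with_feedback(
    query: str,
    retrieve: Callable[[str, int], List[str]],   # returns top-k passages for a query
    generate: Callable[[str, List[str]], str],   # drafts a citation-backed answer
    critique: Callable[[str, str], str],         # returns feedback, or "" if satisfied
    max_rounds: int = 3,
    k: int = 8,
) -> str:
    passages = retrieve(query, k)
    draft = generate(query, passages)
    for _ in range(max_rounds):
        feedback = critique(query, draft)
        if not feedback:                          # no remaining issues flagged
            break
        # Fetch additional evidence targeted at the feedback, then revise the draft.
        passages += retrieve(feedback, k)
        draft = generate(query + "\nRevise using feedback: " + feedback, passages)
    return draft
```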
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xueqing Liu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuchen Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiushi Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jiangrui Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11646v1-abstract-short" style="display: inline;"> In recent years, the rapid growth of security vulnerabilities poses great challenges to tracing and managing them. For example, it was reported that the NVD database experienced significant delays due to the shortage of maintainers. Such delay creates challenges for third-party security personnel (e.g., administrators) to trace the information related to the CVE. To help security personnel trace a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11646v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11646v1-abstract-full" style="display: none;"> In recent years, the rapid growth of security vulnerabilities poses great challenges to tracing and managing them. For example, it was reported that the NVD database experienced significant delays due to the shortage of maintainers. Such delay creates challenges for third-party security personnel (e.g., administrators) to trace the information related to the CVE. To help security personnel trace a vulnerability patch, we build a retrieval system that automatically retrieves the patch in the repository. Inspired by existing work on explainable machine learning, we ask the following research question: can explanations help security maintainers make decisions in patch tracing? First, we investigate using LIME (a widely used explainable machine learning method) to highlight the rationale tokens in the commit message and code. In addition, we propose an explanation method called TfIdf-Highlight, which leverages the Tf-Idf statistics to select the most informative words in the repository and the dataset. We evaluate the effectiveness of highlighting using two experiments. First, we compare LIME and TfIdf-Highlight using a faithfulness score (i.e., sufficiency and comprehensiveness) defined for ranking. We find that TfIdf-Highlight significantly outperforms LIME's sufficiency scores by 15\% and slightly outperforms the comprehensiveness scores. Second, we conduct a blind human labeling experiment by asking the annotators to guess the patch under 3 settings (TfIdf-Highlight, LIME, and no highlight). We find that the helpfulness score for TfIdf-Highlight is higher than LIME while the labeling accuracies of LIME and TfIdf-Highlight are similar. Nevertheless, highlighting does not improve the accuracy over non-highlighting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11646v1-abstract-full').style.display = 'none'; document.getElementById('2411.11646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
4. LBPE: Long-token-first Tokenization to Improve Large Language Models
arXiv:2411.05504 [pdf, other] cs.CL
Authors: Haoran Lian, Yizhe Xiong, Zijia Lin, Jianwei Niu, Shasha Mo, Hui Chen, Peng Liu, Guiguang Ding
Abstract: The prevalent use of Byte Pair Encoding (BPE) in Large Language Models (LLMs) facilitates robust handling of subword units and avoids issues of out-of-vocabulary words. Despite its success, a critical challenge persists: long tokens, rich in semantic information, have fewer occurrences in tokenized datasets compared to short tokens, which can result in imbalanced learning across different tokens. To address that, we propose LBPE, which prioritizes long tokens during the encoding process. LBPE generates tokens according to their reverse ranks of token length rather than their ranks in the vocabulary, granting longer tokens higher priority during encoding. Consequently, LBPE smooths the frequency differences between short and long tokens, and thus mitigates the learning imbalance. Extensive experiments across diverse language modeling tasks demonstrate that LBPE consistently outperforms the original BPE, demonstrating its effectiveness.
Submitted 8 November, 2024; originally announced November 2024.
Comments: arXiv admin note: text overlap with arXiv:2404.17808
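The key change described in the abstract is to give longer tokens priority during encoding instead of following vocabulary-rank order. A toy greedy longest-match sketch conveys the intuition; the actual LBPE reorders BPE merges by token length and is not reproduced here.

```python
# Toy "long-token-first" encoder: at each position, greedily emit the longest
# vocabulary token, instead of applying merges in vocabulary-rank order.
# Illustrative only; not the authors' implementation.
def long_token_first_encode(text: str, vocab: set[str]) -> list[str]:
    tokens, i = [], 0
    max_len = max(len(t) for t in vocab)
    while i < len(text):
        # Try the longest candidate substring first, falling back to single characters.
        for length in range(min(max_len, len(text) - i), 0, -1):
            piece = text[i:i + length]
            if piece in vocab or length == 1:
                tokens.append(piece)
                i += length
                break
    return tokens

# With a vocabulary containing the long token "tokenization", the encoder emits it
# as a single token rather than several short pieces.
print(long_token_first_encode("tokenization", {"token", "iza", "tion", "tokenization"}))
```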
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05504v1-abstract-full').style.display = 'none'; document.getElementById('2411.05504v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2404.17808</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03743">arXiv:2411.03743</a> <span> [<a href="https://arxiv.org/pdf/2411.03743">pdf</a>, <a href="https://arxiv.org/format/2411.03743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Automating Exploratory Proteomics Research via Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+S">Shang Qu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Linhai Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yifei Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zaoqu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yibai Xiong</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+Y">Yuxin Zuo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhangren Chen</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+E">Ermo Hua</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xingtai Lv</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Youbang Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+F">Fuchu He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03743v1-abstract-short" style="display: inline;"> With the development of artificial intelligence, its contribution to science is evolving from simulating a complex problem to automating entire research processes and producing novel discoveries. Achieving this advancement requires both specialized general models grounded in real-world scientific data and iterative, exploratory frameworks that mirror human scientific methodologies. 
In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03743v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03743v1-abstract-full" style="display: none;"> With the development of artificial intelligence, its contribution to science is evolving from simulating a complex problem to automating entire research processes and producing novel discoveries. Achieving this advancement requires both specialized general models grounded in real-world scientific data and iterative, exploratory frameworks that mirror human scientific methodologies. In this paper, we present PROTEUS, a fully automated system for scientific discovery from raw proteomics data. PROTEUS uses large language models (LLMs) to perform hierarchical planning, execute specialized bioinformatics tools, and iteratively refine analysis workflows to generate high-quality scientific hypotheses. The system takes proteomics datasets as input and produces a comprehensive set of research objectives, analysis results, and novel biological hypotheses without human intervention. We evaluated PROTEUS on 12 proteomics datasets collected from various biological samples (e.g. immune cells, tumors) and different sample types (single-cell and bulk), generating 191 scientific hypotheses. These were assessed using both automatic LLM-based scoring on 5 metrics and detailed reviews from human experts. Results demonstrate that PROTEUS consistently produces reliable, logically coherent results that align well with existing literature while also proposing novel, evaluable hypotheses. The system's flexible architecture facilitates seamless integration of diverse analysis tools and adaptation to different proteomics data types. By automating complex proteomics analysis workflows and hypothesis generation, PROTEUS has the potential to considerably accelerate the pace of scientific discovery in proteomics research, enabling researchers to efficiently explore large-scale datasets and uncover biological insights. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03743v1-abstract-full').style.display = 'none'; document.getElementById('2411.03743v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
6. OCEAN: Offline Chain-of-thought Evaluation and Alignment in Large Language Models
arXiv:2410.23703 [pdf, other] cs.LG cs.CL
Authors: Junda Wu, Xintong Li, Ruoyu Wang, Yu Xia, Yuxin Xiong, Jianing Wang, Tong Yu, Xiang Chen, Branislav Kveton, Lina Yao, Jingbo Shang, Julian McAuley
Abstract: Offline evaluation of LLMs is crucial in understanding their capacities, though current methods remain underexplored in existing research. In this work, we focus on the offline evaluation of chain-of-thought capabilities and show how to optimize LLMs based on the proposed evaluation method. To enable offline feedback with rich knowledge and reasoning paths, we use knowledge graphs (e.g., Wikidata5m) to provide feedback on the generated chain of thoughts. Due to the heterogeneity between LLM reasoning and KG structures, direct interaction and feedback from KGs on LLM behavior are challenging, as they require accurate entity linking and grounding of LLM-generated chains of thought in the KG. To address this challenge, we propose an offline chain-of-thought evaluation framework, OCEAN, which models chain-of-thought reasoning in LLMs as an MDP and evaluates the policy's alignment with KG preference modeling. To overcome the reasoning heterogeneity and grounding problems, we leverage on-policy KG exploration and reinforcement learning to model a KG policy that generates token-level likelihood distributions for LLM-generated chain-of-thought reasoning paths, simulating KG reasoning preference. We then incorporate the knowledge-graph feedback on the validity and alignment of the generated reasoning paths into inverse propensity scores and propose the KG-IPS estimator. Theoretically, we prove the unbiasedness of the proposed KG-IPS estimator and provide a lower bound on its variance. With the off-policy evaluated value function, we can directly enable off-policy optimization to further enhance chain-of-thought alignment. Our empirical study shows that OCEAN can be efficiently optimized for generating chain-of-thought reasoning paths with higher estimated values without affecting LLMs' general abilities in downstream tasks or their internal knowledge.
Submitted 31 October, 2024; originally announced October 2024.
Comments: 10 pages
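The abstract folds knowledge-graph feedback into inverse propensity scores but does not spell out the KG-IPS estimator. For background, the standard IPS value estimate for a target policy $\pi$ under a logging policy $\mu$, with observed rewards $r_i$, has the form:

```latex
% Standard IPS estimator, shown for background only; the paper's KG-IPS variant is not specified here.
\hat{V}_{\mathrm{IPS}}(\pi) \;=\; \frac{1}{n} \sum_{i=1}^{n} \frac{\pi(a_i \mid x_i)}{\mu(a_i \mid x_i)} \, r_i
```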
7. MIA-DPO: Multi-Image Augmented Direct Preference Optimization For Large Vision-Language Models
arXiv:2410.17637 [pdf, other] cs.CV cs.AI
Authors: Ziyu Liu, Yuhang Zang, Xiaoyi Dong, Pan Zhang, Yuhang Cao, Haodong Duan, Conghui He, Yuanjun Xiong, Dahua Lin, Jiaqi Wang
Abstract: Visual preference alignment involves training Large Vision-Language Models (LVLMs) to predict human preferences between visual inputs. This is typically achieved by using labeled datasets of chosen/rejected pairs and employing optimization algorithms like direct preference optimization (DPO). Existing visual alignment methods, primarily designed for single-image scenarios, struggle to effectively handle the complexity of multi-image tasks due to the scarcity of diverse training data and the high cost of annotating chosen/rejected pairs. We present Multi-Image Augmented Direct Preference Optimization (MIA-DPO), a visual preference alignment approach that effectively handles multi-image inputs. MIA-DPO mitigates the scarcity of diverse multi-image training data by extending single-image data with unrelated images arranged in grid collages or pic-in-pic formats, significantly reducing the costs associated with multi-image data annotations. We observe that the attention values of LVLMs vary considerably across different images, and we use these attention values to identify and filter out rejected responses the model may have mistakenly focused on. Our attention-aware selection constructs the chosen/rejected pairs without relying on (i) human annotation, (ii) extra data, or (iii) external models or APIs. MIA-DPO is compatible with various architectures and outperforms existing methods on five multi-image benchmarks, achieving an average performance boost of 3.0% on LLaVA-v1.5 and 4.3% on the recent InternLM-XC2.5. Moreover, MIA-DPO has a minimal effect on the model's ability to understand single images.
Submitted 23 October, 2024; originally announced October 2024.
Comments: Project URL: https://github.com/Liuziyu77/MIA-DPO
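The augmentation described above extends a single-image example with unrelated images arranged as a grid collage (or pic-in-pic). A minimal Pillow sketch of the grid-collage variant follows; the layout and resolution are assumptions, not the paper's settings.

```python
# Minimal sketch of a 2x2 grid collage built from a target image plus unrelated
# "distractor" images, in the spirit of the multi-image augmentation described above.
# Cell size and layout are illustrative assumptions.
from PIL import Image

def make_grid_collage(target: Image.Image, distractors: list[Image.Image],
                      cell: int = 336) -> Image.Image:
    imgs = [target] + distractors[:3]            # target plus up to three unrelated images
    canvas = Image.new("RGB", (2 * cell, 2 * cell))
    for idx, img in enumerate(imgs):
        row, col = divmod(idx, 2)
        canvas.paste(img.resize((cell, cell)), (col * cell, row * cell))
    return canvas  # the original question still refers only to the target image
```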
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project URL: https://github.com/Liuziyu77/MIA-DPO</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17434">arXiv:2410.17434</a> <span> [<a href="https://arxiv.org/pdf/2410.17434">pdf</a>, <a href="https://arxiv.org/format/2410.17434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xiaoqian Shen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yunyang Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Changsheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lemeng Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenchen Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zechun Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+F">Fanyi Xiao</a>, <a href="/search/cs?searchtype=author&query=Varadarajan%2C+B">Balakrishnan Varadarajan</a>, <a href="/search/cs?searchtype=author&query=Bordes%2C+F">Florian Bordes</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuang Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hu Xu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a>, <a href="/search/cs?searchtype=author&query=Soran%2C+B">Bilge Soran</a>, <a href="/search/cs?searchtype=author&query=Krishnamoorthi%2C+R">Raghuraman Krishnamoorthi</a>, <a href="/search/cs?searchtype=author&query=Elhoseiny%2C+M">Mohamed Elhoseiny</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17434v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have shown promising progress in understanding and analyzing video content. However, processing long videos remains a significant challenge constrained by LLM's context size. To address this limitation, we propose LongVU, a spatiotemporal adaptive compression mechanism thats reduces the number of video tokens while preserving visual details of long videos.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17434v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17434v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17434v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have shown promising progress in understanding and analyzing video content. However, processing long videos remains a significant challenge constrained by LLM's context size. To address this limitation, we propose LongVU, a spatiotemporal adaptive compression mechanism thats reduces the number of video tokens while preserving visual details of long videos. 
Our idea is based on leveraging cross-modal query and inter-frame dependencies to adaptively reduce temporal and spatial redundancy in videos. Specifically, we leverage DINOv2 features to remove redundant frames that exhibit high similarity. Then we utilize text-guided cross-modal query for selective frame feature reduction. Further, we perform spatial token reduction across frames based on their temporal dependencies. Our adaptive compression strategy effectively processes a large number of frames with little visual information loss within given context length. Our LongVU consistently surpass existing methods across a variety of video understanding benchmarks, especially on hour-long video understanding tasks such as VideoMME and MLVU. Given a light-weight LLM, our LongVU also scales effectively into a smaller size with state-of-the-art video understanding performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17434v1-abstract-full').style.display = 'none'; document.getElementById('2410.17434v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://vision-cair.github.io/LongVU</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16077">arXiv:2410.16077</a> <span> [<a href="https://arxiv.org/pdf/2410.16077">pdf</a>, <a href="https://arxiv.org/format/2410.16077">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CartesianMoE: Boosting Knowledge Sharing among Experts via Cartesian Product Routing in Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhenpeng Su</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xing Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zijia Lin</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yizhe Xiong</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+M">Minxuan Lv</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guangyuan Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Songlin Hu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+G">Guiguang Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16077v2-abstract-short" style="display: inline;"> Large language models (LLM) have been attracting much attention from the community recently, due to their remarkable performance in all kinds of downstream tasks. According to the well-known scaling law, scaling up a dense LLM enhances its capabilities, but also significantly increases the computational complexity. 
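The first compression step described above drops frames whose DINOv2 features are highly similar to the most recent kept frame. A generic cosine-similarity sketch of that step follows; the threshold and feature pooling are assumptions, not the authors' code.

```python
# Sketch of similarity-based frame pruning: keep a frame only if its feature vector
# differs enough from the last kept frame. The threshold is an illustrative assumption.
import numpy as np

def prune_redundant_frames(frame_features: np.ndarray, sim_threshold: float = 0.85) -> list[int]:
    """frame_features: (num_frames, dim) array, e.g. pooled DINOv2 embeddings per frame."""
    feats = frame_features / np.linalg.norm(frame_features, axis=1, keepdims=True)
    kept = [0]                                    # always keep the first frame
    for i in range(1, len(feats)):
        cos_sim = float(feats[i] @ feats[kept[-1]])
        if cos_sim < sim_threshold:               # sufficiently novel frame, keep it
            kept.append(i)
    return kept                                   # indices of frames passed to the LLM
```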
9. CartesianMoE: Boosting Knowledge Sharing among Experts via Cartesian Product Routing in Mixture-of-Experts
arXiv:2410.16077 [pdf, other] cs.LG cs.CL
Authors: Zhenpeng Su, Xing Wu, Zijia Lin, Yizhe Xiong, Minxuan Lv, Guangyuan Ma, Hui Chen, Songlin Hu, Guiguang Ding
Abstract: Large language models (LLMs) have been attracting much attention from the community recently, due to their remarkable performance in all kinds of downstream tasks. According to the well-known scaling law, scaling up a dense LLM enhances its capabilities, but also significantly increases the computational complexity. Mixture-of-Experts (MoE) models address that by allowing the model size to grow without substantially raising training or inference costs. Yet MoE models face challenges regarding knowledge sharing among experts, making their performance somewhat sensitive to routing accuracy. To tackle that, previous works introduced shared experts and combined their outputs with those of the top-K routed experts in an "addition" manner. In this paper, inspired by collective matrix factorization for learning shared knowledge among data, we propose CartesianMoE, which implements more effective knowledge sharing among experts in more of a "multiplication" manner. Extensive experimental results indicate that CartesianMoE outperforms previous MoE models for building LLMs, in terms of both perplexity and downstream task performance. We also find that CartesianMoE achieves better expert routing robustness.
Submitted 22 October, 2024; v1 submitted 21 October, 2024; originally announced October 2024.
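For context, the "addition"-style baseline the abstract contrasts against sums a shared expert's output with the weighted top-K routed experts. A minimal PyTorch sketch of that baseline is below; CartesianMoE's own "multiplication"-style combination is not specified in the abstract and is not reproduced.

```python
# Minimal sketch of the "addition"-style shared-expert MoE layer described as prior work:
# output = shared_expert(x) + weighted sum of the top-K routed experts.
# Dimensions and module choices are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SharedExpertMoE(nn.Module):
    def __init__(self, dim: int, num_experts: int = 8, top_k: int = 2):
        super().__init__()
        self.top_k = top_k
        self.router = nn.Linear(dim, num_experts)
        self.experts = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_experts)])
        self.shared_expert = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:        # x: (batch, dim)
        weights, idx = self.router(x).topk(self.top_k, dim=-1)  # route each token to top-K experts
        weights = F.softmax(weights, dim=-1)
        routed = torch.zeros_like(x)
        for slot in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, slot] == e
                if mask.any():
                    routed[mask] += weights[mask, slot].unsqueeze(-1) * expert(x[mask])
        return self.shared_expert(x) + routed                   # "addition"-style combination
```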
10. Diffusion Transformer Policy
arXiv:2410.15959 [pdf, other] cs.RO cs.CV
Authors: Zhi Hou, Tianyi Zhang, Yuwen Xiong, Hengjun Pu, Chengyang Zhao, Ronglei Tong, Yu Qiao, Jifeng Dai, Yuntao Chen
Abstract: Recent large visual-language action models pretrained on diverse robot datasets have demonstrated the potential for generalizing to new environments with a small amount of in-domain data. However, those approaches usually predict discretized or continuous actions with a small action head, which limits their ability to handle diverse action spaces. In contrast, we model continuous actions with a large multi-modal diffusion transformer, dubbed Diffusion Transformer Policy, in which we directly denoise action chunks with a large transformer model rather than a small action head. By leveraging the scaling capability of transformers, the proposed approach can effectively model continuous end-effector actions across large, diverse robot datasets and achieve better generalization performance. Extensive experiments demonstrate that Diffusion Transformer Policy pretrained on diverse robot data can generalize to different embodiments, including simulation environments like Maniskill2 and Calvin, as well as the real-world Franka arm. Specifically, without bells and whistles, the proposed approach achieves state-of-the-art performance with only a single third-view camera stream in the Calvin novel task setting (ABC->D), improving the average number of tasks completed in a row of 5 to 3.6, and the pretraining stage significantly facilitates the success sequence length on Calvin by over 1.2. The code will be publicly available.
Submitted 21 October, 2024; originally announced October 2024.
Comments: Preprint
However, learning all molecular samples in the vast chemical space is unrealistic for predictors. As a result, errors and noise are inevitably introduced during property prediction due to the nature of approximation. This leads to discrepancy accumulation, generalization reduction and suboptimal molecular candidates. In this paper, we propose a text-guided multi-property molecular optimization method utilizing transformer-based diffusion language model (TransDLM). TransDLM leverages standardized chemical nomenclature as semantic representations of molecules and implicitly embeds property requirements into textual descriptions, thereby preventing error propagation during diffusion process. Guided by physically and chemically detailed textual descriptions, TransDLM samples and optimizes encoded source molecules, retaining core scaffolds of source molecules and ensuring structural similarities. Moreover, TransDLM enables simultaneous sampling of multiple molecules, making it ideal for scalable, efficient large-scale optimization through distributed computation on web platforms. Furthermore, our approach surpasses state-of-the-art methods in optimizing molecular structural similarity and enhancing chemical properties on the benchmark dataset. The code is available at: https://anonymous.4open.science/r/TransDLM-A901. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13597v1-abstract-full').style.display = 'none'; document.getElementById('2410.13597v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11744">arXiv:2410.11744</a> <span> [<a href="https://arxiv.org/pdf/2410.11744">pdf</a>, <a href="https://arxiv.org/format/2410.11744">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DySpec: Faster Speculative Decoding with Dynamic Token Tree Structure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yunfan Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanzeng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tianhao Wu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+L">Lei Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11744v1-abstract-short" style="display: inline;"> While speculative decoding has recently appeared as a promising direction for accelerating the inference of large language models (LLMs), the speedup and scalability are strongly bounded by the token acceptance rate. Prevalent methods usually organize predicted tokens as independent chains or fixed token trees, which fails to generalize to diverse query distributions. 
In this paper, we propose DyS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11744v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11744v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11744v1-abstract-full" style="display: none;"> While speculative decoding has recently appeared as a promising direction for accelerating the inference of large language models (LLMs), the speedup and scalability are strongly bounded by the token acceptance rate. Prevalent methods usually organize predicted tokens as independent chains or fixed token trees, which fails to generalize to diverse query distributions. In this paper, we propose DySpec, a faster speculative decoding algorithm with a novel dynamic token tree structure. We begin by bridging the draft distribution and acceptance rate from intuitive and empirical clues, and successfully show that the two variables are strongly correlated. Based on this, we employ a greedy strategy to dynamically expand the token tree at run time. Theoretically, we show that our method can achieve optimal results under mild assumptions. Empirically, DySpec yields a higher acceptance rate and speedup than fixed trees. DySpec can drastically improve the throughput and reduce the latency of token generation across various data distribution and model sizes, which significantly outperforms strong competitors, including Specinfer and Sequoia. Under low temperature setting, DySpec can improve the throughput up to 9.1$\times$ and reduce the latency up to 9.4$\times$ on Llama2-70B. Under high temperature setting, DySpec can also improve the throughput up to 6.21$\times$, despite the increasing difficulty of speculating more than one token per step for draft model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11744v1-abstract-full').style.display = 'none'; document.getElementById('2410.11744v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
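<p class="is-size-7">The dynamic-tree idea above can be illustrated with a small, self-contained sketch: grow a draft-token tree greedily, always expanding the path whose draft probability (used here as an acceptance proxy) is highest, under a fixed node budget. The <code>draft_topk</code> stand-in and the budgeting policy are assumptions for illustration, not DySpec's actual algorithm or code.</p>
<pre><code class="language-python">
# Greedy, budgeted expansion of a draft-token tree, in the spirit of the
# abstract above; draft_topk is a hypothetical stand-in for a draft model.
import heapq


def draft_topk(prefix, k=3):
    """Hypothetical draft-model call: returns k (token, prob) continuations
    of `prefix`. A real system would query a small LLM here."""
    base = (hash(prefix) % 7 + 1) / 10.0
    return [("tok%d" % i, base / (i + 1)) for i in range(k)]


def build_dynamic_tree(root_prefix, node_budget=16, k=3):
    """Grow the tree greedily: always expand the frontier node whose path
    probability (estimated acceptance) is currently the highest."""
    tree = {0: {"prefix": root_prefix, "children": []}}
    # heap of (-path_probability, node_id); heapq pops the smallest element,
    # i.e. the most probable path first
    frontier = [(-1.0, 0)]
    next_id = 1
    while frontier:
        if next_id > node_budget:
            break
        neg_prob, node_id = heapq.heappop(frontier)
        for token, prob in draft_topk(tree[node_id]["prefix"], k):
            if next_id > node_budget:
                break
            tree[next_id] = {"prefix": tree[node_id]["prefix"] + (token,),
                             "children": []}
            tree[node_id]["children"].append(next_id)
            heapq.heappush(frontier, (neg_prob * prob, next_id))
            next_id += 1
    return tree


if __name__ == "__main__":
    for nid, node in build_dynamic_tree(("The",), node_budget=8).items():
        print(nid, node["prefix"], node["children"])
</code></pre>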
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11008">arXiv:2410.11008</a> <span> [<a href="https://arxiv.org/pdf/2410.11008">pdf</a>, <a href="https://arxiv.org/format/2410.11008">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> V2I-Calib++: A Multi-terminal Spatial Calibration Approach in Urban Intersections for Collaborative Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qu%2C+Q">Qianxin Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yijin Xiong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shichun Guo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Ziqiang Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11008v1-abstract-short" style="display: inline;"> Urban intersections, dense with pedestrian and vehicular traffic and compounded by GPS signal obstructions from high-rise buildings, are among the most challenging areas in urban traffic systems. Traditional single-vehicle intelligence systems often perform poorly in such environments due to a lack of global traffic flow information and the ability to respond to unexpected events. Vehicle-to-Every… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11008v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11008v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11008v1-abstract-full" style="display: none;"> Urban intersections, dense with pedestrian and vehicular traffic and compounded by GPS signal obstructions from high-rise buildings, are among the most challenging areas in urban traffic systems. Traditional single-vehicle intelligence systems often perform poorly in such environments due to a lack of global traffic flow information and the ability to respond to unexpected events. Vehicle-to-Everything (V2X) technology, through real-time communication between vehicles (V2V) and vehicles to infrastructure (V2I), offers a robust solution. However, practical applications still face numerous challenges. Calibration among heterogeneous vehicle and infrastructure endpoints in multi-end LiDAR systems is crucial for ensuring the accuracy and consistency of perception system data. Most existing multi-end calibration methods rely on initial calibration values provided by positioning systems, but the instability of GPS signals due to high buildings in urban canyons poses severe challenges to these methods. To address this issue, this paper proposes a novel multi-end LiDAR system calibration method that does not require positioning priors to determine initial external parameters and meets real-time requirements. 
Our method introduces an innovative multi-end perception object association technique, utilizing a new Overall Distance metric (oDist) to measure the spatial association between perception objects, and effectively combines global consistency search algorithms with optimal transport theory. By this means, we can extract co-observed targets from object association results for further external parameter computation and optimization. Extensive comparative and ablation experiments conducted on the simulated dataset V2X-Sim and the real dataset DAIR-V2X confirm the effectiveness and efficiency of our method. The code for this method can be accessed at: \url{https://github.com/MassimoQu/v2i-calib}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11008v1-abstract-full').style.display = 'none'; document.getElementById('2410.11008v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10934">arXiv:2410.10934</a> <span> [<a href="https://arxiv.org/pdf/2410.10934">pdf</a>, <a href="https://arxiv.org/format/2410.10934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Agent-as-a-Judge: Evaluate Agents with Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuge%2C+M">Mingchen Zhuge</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Changsheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Ashley%2C+D">Dylan Ashley</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenyi Wang</a>, <a href="/search/cs?searchtype=author&query=Khizbullin%2C+D">Dmitrii Khizbullin</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yunyang Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zechun Liu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/cs?searchtype=author&query=Krishnamoorthi%2C+R">Raghuraman Krishnamoorthi</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuandong Tian</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a>, <a href="/search/cs?searchtype=author&query=Schmidhuber%2C+J">Jürgen Schmidhuber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10934v2-abstract-short" style="display: inline;"> Contemporary evaluation techniques are inadequate for agentic systems. These approaches either focus exclusively on final outcomes -- ignoring the step-by-step nature of agentic systems, or require excessive manual labour. To address this, we introduce the Agent-as-a-Judge framework, wherein agentic systems are used to evaluate agentic systems.
This is an organic extension of the LLM-as-a-Judge fr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10934v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10934v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10934v2-abstract-full" style="display: none;"> Contemporary evaluation techniques are inadequate for agentic systems. These approaches either focus exclusively on final outcomes -- ignoring the step-by-step nature of agentic systems, or require excessive manual labour. To address this, we introduce the Agent-as-a-Judge framework, wherein agentic systems are used to evaluate agentic systems. This is an organic extension of the LLM-as-a-Judge framework, incorporating agentic features that enable intermediate feedback for the entire task-solving process. We apply the Agent-as-a-Judge to the task of code generation. To overcome issues with existing benchmarks and provide a proof-of-concept testbed for Agent-as-a-Judge, we present DevAI, a new benchmark of 55 realistic automated AI development tasks. It includes rich manual annotations, like a total of 365 hierarchical user requirements. We benchmark three of the popular agentic systems using Agent-as-a-Judge and find it dramatically outperforms LLM-as-a-Judge and is as reliable as our human evaluation baseline. Altogether, we believe that Agent-as-a-Judge marks a concrete step forward for modern agentic systems -- by providing rich and reliable reward signals necessary for dynamic and scalable self-improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10934v2-abstract-full').style.display = 'none'; document.getElementById('2410.10934v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project can be found at https://github.com/metauto-ai/agent-as-a-judge. 
The dataset is released at https://huggingface.co/DEVAI-benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10267">arXiv:2410.10267</a> <span> [<a href="https://arxiv.org/pdf/2410.10267">pdf</a>, <a href="https://arxiv.org/format/2410.10267">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> big.LITTLE Vision Transformer for Efficient Visual Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">He Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yulong Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zixuan Ye</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuwen Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10267v1-abstract-short" style="display: inline;"> In this paper, we introduce the big.LITTLE Vision Transformer, an innovative architecture aimed at achieving efficient visual recognition. This dual-transformer system is composed of two distinct blocks: the big performance block, characterized by its high capacity and substantial computational demands, and the LITTLE efficiency block, designed for speed with lower capacity. The key innovation of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10267v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10267v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10267v1-abstract-full" style="display: none;"> In this paper, we introduce the big.LITTLE Vision Transformer, an innovative architecture aimed at achieving efficient visual recognition. This dual-transformer system is composed of two distinct blocks: the big performance block, characterized by its high capacity and substantial computational demands, and the LITTLE efficiency block, designed for speed with lower capacity. The key innovation of our approach lies in its dynamic inference mechanism. When processing an image, our system determines the importance of each token and allocates them accordingly: essential tokens are processed by the high-performance big model, while less critical tokens are handled by the more efficient little model. This selective processing significantly reduces computational load without sacrificing the overall performance of the model, as it ensures that detailed analysis is reserved for the most important information. To validate the effectiveness of our big.LITTLE Vision Transformer, we conducted comprehensive experiments on image classification and segment anything task. Our results demonstrate that the big.LITTLE architecture not only maintains high accuracy but also achieves substantial computational savings. Specifically, our approach enables the efficient handling of large-scale visual recognition tasks by dynamically balancing the trade-offs between performance and efficiency. 
The success of our method underscores the potential of hybrid models in optimizing both computation and performance in visual recognition tasks, paving the way for more practical and scalable deployment of advanced neural networks in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10267v1-abstract-full').style.display = 'none'; document.getElementById('2410.10267v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06157">arXiv:2410.06157</a> <span> [<a href="https://arxiv.org/pdf/2410.06157">pdf</a>, <a href="https://arxiv.org/format/2410.06157">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Detecting Android Malware by Visualizing App Behaviors from Multiple Complementary Views </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+Z">Zhaoyi Meng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiale Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiaqi Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wansen Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenchao Huang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+J">Jie Cui</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Hong Zhong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06157v1-abstract-short" style="display: inline;"> Deep learning has emerged as a promising technology for achieving Android malware detection. To further unleash its detection potentials, software visualization can be integrated for analyzing the details of app behaviors clearly. However, facing increasingly sophisticated malware, existing visualization-based methods, analyzing from one or randomly-selected few views, can only detect limited atta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06157v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06157v1-abstract-full" style="display: none;"> Deep learning has emerged as a promising technology for achieving Android malware detection. To further unleash its detection potentials, software visualization can be integrated for analyzing the details of app behaviors clearly. However, facing increasingly sophisticated malware, existing visualization-based methods, analyzing from one or randomly-selected few views, can only detect limited attack types. 
We propose and implement LensDroid, a novel technique that detects Android malware by visualizing app behaviors from multiple complementary views. Our goal is to harness the power of combining deep learning and software visualization to automatically capture and aggregate high-level features that are not inherently linked, thereby revealing hidden maliciousness of Android app behaviors. To thoroughly comprehend the details of apps, we visualize app behaviors from three related but distinct views of behavioral sensitivities, operational contexts and supported environments. We then extract high-order semantics based on the views accordingly. To exploit semantic complementarity of the views, we design a deep neural network based model for fusing the visualized features from local to global based on their contributions to downstream tasks. A comprehensive comparison with five baseline techniques is performed on datasets of more than 51K apps in three real-world typical scenarios, including overall threats, app evolution and zero-day malware. The experimental results show that the overall performance of LensDroid is better than the baseline techniques. We also validate the complementarity of the views and demonstrate that the multi-view fusion in LensDroid enhances Android malware detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06157v1-abstract-full').style.display = 'none'; document.getElementById('2410.06157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to TIFS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04797">arXiv:2410.04797</a> <span> [<a href="https://arxiv.org/pdf/2410.04797">pdf</a>, <a href="https://arxiv.org/format/2410.04797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Attentive-based Multi-level Feature Fusion for Voice Disorder Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+L">Lipeng Shen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yifan Xiong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dongyue Guo</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+W">Wei Mo</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lingyu Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hui Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yi Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04797v1-abstract-short" style="display: inline;"> Voice disorders negatively impact the quality of daily life in various ways. 
However, accurately recognizing the category of pathological features from raw audio remains a considerable challenge due to the limited dataset. A promising method to handle this issue is extracting multi-level pathological information from speech in a comprehensive manner by fusing features in the latent space. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04797v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04797v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04797v1-abstract-full" style="display: none;"> Voice disorders negatively impact the quality of daily life in various ways. However, accurately recognizing the category of pathological features from raw audio remains a considerable challenge due to the limited dataset. A promising method to handle this issue is extracting multi-level pathological information from speech in a comprehensive manner by fusing features in the latent space. In this paper, a novel framework is designed to explore the way of high-quality feature fusion for effective and generalized detection performance. Specifically, the proposed model follows a two-stage training paradigm: (1) ECAPA-TDNN and Wav2vec 2.0 which have shown remarkable effectiveness in various domains are employed to learn the universal pathological information from raw audio; (2) An attentive fusion module is dedicatedly designed to establish the interaction between pathological features projected by EcapTdnn and Wav2vec 2.0 respectively and guide the multi-layer fusion, the entire model is jointly fine-tuned from pre-trained features by the automatic voice pathology detection task. Finally, comprehensive experiments on the FEMH and SVD datasets demonstrate that the proposed framework outperforms the competitive baselines, and achieves the accuracy of 90.51% and 87.68%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04797v1-abstract-full').style.display = 'none'; document.getElementById('2410.04797v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
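<p class="is-size-7">As a rough illustration of the attentive fusion of two pretrained feature streams described above, the sketch below cross-attends ECAPA-TDNN-like and Wav2vec 2.0-like features (replaced here by random tensors) and pools them for a binary decision. The block is a generic cross-attention module assumed for illustration, not the paper's architecture.</p>
<pre><code class="language-python">
# Generic cross-attention fusion of two utterance-level feature streams;
# the pretrained extractors are replaced by random tensors for illustration.
import torch
import torch.nn as nn


class AttentiveFusion(nn.Module):
    def __init__(self, dim_a, dim_b, d_fused=256, heads=4):
        super().__init__()
        self.proj_a = nn.Linear(dim_a, d_fused)
        self.proj_b = nn.Linear(dim_b, d_fused)
        self.cross = nn.MultiheadAttention(d_fused, heads, batch_first=True)
        self.head = nn.Sequential(nn.LayerNorm(d_fused), nn.Linear(d_fused, 2))

    def forward(self, feats_a, feats_b):
        a = self.proj_a(feats_a)            # (batch, frames_a, d_fused)
        b = self.proj_b(feats_b)            # (batch, frames_b, d_fused)
        # each stream attends to the other, then both are pooled and averaged
        ab, _ = self.cross(a, b, b)
        ba, _ = self.cross(b, a, a)
        pooled = 0.5 * (ab.mean(dim=1) + ba.mean(dim=1))
        return self.head(pooled)            # 2-way healthy / pathological logits


if __name__ == "__main__":
    ecapa_like = torch.randn(3, 50, 192)    # stand-in for ECAPA-TDNN features
    w2v_like = torch.randn(3, 120, 768)     # stand-in for Wav2vec 2.0 features
    logits = AttentiveFusion(192, 768)(ecapa_like, w2v_like)
    print(logits.shape)                     # torch.Size([3, 2])
</code></pre>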
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00428">arXiv:2410.00428</a> <span> [<a href="https://arxiv.org/pdf/2410.00428">pdf</a>, <a href="https://arxiv.org/format/2410.00428">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LayerKV: Optimizing Large Language Model Serving with Layer-wise KV Cache Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yi Xiong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+C">Changxu Shao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqing Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuhong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Junping Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zhenxuan Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00428v3-abstract-short" style="display: inline;"> The expanding context windows in large language models (LLMs) have greatly enhanced their capabilities in various applications, but they also introduce significant challenges in maintaining low latency, particularly in Time to First Token (TTFT). This paper identifies that the sharp rise in TTFT as context length increases is predominantly driven by queuing delays, which are caused by the growing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00428v3-abstract-full').style.display = 'inline'; document.getElementById('2410.00428v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00428v3-abstract-full" style="display: none;"> The expanding context windows in large language models (LLMs) have greatly enhanced their capabilities in various applications, but they also introduce significant challenges in maintaining low latency, particularly in Time to First Token (TTFT). This paper identifies that the sharp rise in TTFT as context length increases is predominantly driven by queuing delays, which are caused by the growing demands for GPU Key-Value (KV) cache allocation clashing with the limited availability of KV cache blocks. To address this issue, we propose LayerKV, a simple yet effective plug-in method that effectively reduces TTFT without requiring additional hardware or compromising output performance, while seamlessly integrating with existing parallelism strategies and scheduling techniques. Specifically, LayerKV introduces layer-wise KV block allocation, management, and offloading for fine-grained control over system memory, coupled with an SLO-aware scheduler to optimize overall Service Level Objectives (SLOs). 
Comprehensive evaluations on representative models, ranging from 7B to 70B parameters, across various GPU configurations, demonstrate that LayerKV improves TTFT latency up to 69x and reduces SLO violation rates by 28.7%, significantly enhancing the user experience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00428v3-abstract-full').style.display = 'none'; document.getElementById('2410.00428v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures, 1 table</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11; C.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20343">arXiv:2409.20343</a> <span> [<a href="https://arxiv.org/pdf/2409.20343">pdf</a>, <a href="https://arxiv.org/format/2409.20343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Demystifying and Assessing Code Understandability in Java Decompilation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+R">Ruixin Qin</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yifan Xiong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yifei Lu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+M">Minxue Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20343v1-abstract-short" style="display: inline;"> Decompilation, the process of converting machine-level code into readable source code, plays a critical role in reverse engineering. Given that the main purpose of decompilation is to facilitate code comprehension in scenarios where the source code is unavailable, the understandability of decompiled code is of great importance. In this paper, we propose the first empirical study on the understanda… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20343v1-abstract-full').style.display = 'inline'; document.getElementById('2409.20343v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20343v1-abstract-full" style="display: none;"> Decompilation, the process of converting machine-level code into readable source code, plays a critical role in reverse engineering. Given that the main purpose of decompilation is to facilitate code comprehension in scenarios where the source code is unavailable, the understandability of decompiled code is of great importance. 
In this paper, we propose the first empirical study on the understandability of Java decompiled code and obtained the following findings: (1) Understandability of Java decompilation is considered as important as its correctness, and decompilation understandability issues are even more commonly encountered than decompilation failures. (2) A notable percentage of code snippets decompiled by Java decompilers exhibit significantly lower or higher levels of understandability in comparison to their original source code. (3) Unfortunately, Cognitive Complexity demonstrates relatively acceptable precision while low recall in recognizing these code snippets exhibiting diverse understandability during decompilation. (4) Even worse, perplexity demonstrates lower levels of precision and recall in recognizing such code snippets. Inspired by the four findings, we further proposed six code patterns and the first metric for the assessment of decompiled code understandability. This metric was extended from Cognitive Complexity, with six more rules harvested from an exhaustive manual analysis into 1287 pairs of source code snippets and corresponding decompiled code. This metric was also validated using the original and updated dataset, yielding an impressive macro F1-score of 0.88 on the original dataset, and 0.86 on the test set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20343v1-abstract-full').style.display = 'none'; document.getElementById('2409.20343v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19720">arXiv:2409.19720</a> <span> [<a href="https://arxiv.org/pdf/2409.19720">pdf</a>, <a href="https://arxiv.org/format/2409.19720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FAST: A Dual-tier Few-Shot Learning Paradigm for Whole Slide Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+K">Kexue Fu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiaoyuan Luo</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Ying Xiong</a>, <a href="/search/cs?searchtype=author&query=Maglogiannis%2C+I">Ilias Maglogiannis</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Longxiang Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Manning Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19720v1-abstract-short" style="display: inline;"> The expensive fine-grained annotation and data scarcity have become the primary obstacles for the widespread adoption of deep learning-based Whole Slide Images (WSI) classification algorithms in clinical practice. Unlike few-shot learning methods in natural images that can leverage the labels of each image, existing few-shot WSI classification methods only utilize a small number of fine-grained la… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19720v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19720v1-abstract-full" style="display: none;"> The expensive fine-grained annotation and data scarcity have become the primary obstacles for the widespread adoption of deep learning-based Whole Slide Images (WSI) classification algorithms in clinical practice. Unlike few-shot learning methods in natural images that can leverage the labels of each image, existing few-shot WSI classification methods only utilize a small number of fine-grained labels or weakly supervised slide labels for training in order to avoid expensive fine-grained annotation. They lack sufficient mining of available WSIs, severely limiting WSI classification performance. To address the above issues, we propose a novel and efficient dual-tier few-shot learning paradigm for WSI classification, named FAST. FAST consists of a dual-level annotation strategy and a dual-branch classification framework. Firstly, to avoid expensive fine-grained annotation, we collect a very small number of WSIs at the slide level, and annotate an extremely small number of patches. 
Then, to fully mining the available WSIs, we use all the patches and available patch labels to build a cache branch, which utilizes the labeled patches to learn the labels of unlabeled patches and through knowledge retrieval for patch classification. In addition to the cache branch, we also construct a prior branch that includes learnable prompt vectors, using the text encoder of visual-language models for patch classification. Finally, we integrate the results from both branches to achieve WSI classification. Extensive experiments on binary and multi-class datasets demonstrate that our proposed method significantly surpasses existing few-shot classification methods and approaches the accuracy of fully supervised methods with only 0.22$\%$ annotation costs. All codes and models will be publicly available on https://github.com/fukexue/FAST. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19720v1-abstract-full').style.display = 'none'; document.getElementById('2409.19720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15895">arXiv:2409.15895</a> <span> [<a href="https://arxiv.org/pdf/2409.15895">pdf</a>, <a href="https://arxiv.org/format/2409.15895">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Preference-Guided Refactored Tuning for Retrieval Augmented Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xinyu Gao</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Deze Wang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+Z">Zhenhan Guan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zejian Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofen Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shanshan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15895v1-abstract-short" style="display: inline;"> Retrieval-augmented code generation utilizes Large Language Models as the generator and significantly expands their code generation capabilities by providing relevant code, documentation, and more via the retriever. The current approach suffers from two primary limitations: 1) information redundancy. 
The indiscriminate inclusion of redundant information can result in resource wastage and may misgu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15895v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15895v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15895v1-abstract-full" style="display: none;"> Retrieval-augmented code generation utilizes Large Language Models as the generator and significantly expands their code generation capabilities by providing relevant code, documentation, and more via the retriever. The current approach suffers from two primary limitations: 1) information redundancy. The indiscriminate inclusion of redundant information can result in resource wastage and may misguide generators, affecting their effectiveness and efficiency. 2) preference gap. Due to different optimization objectives, the retriever strives to procure code with higher ground truth similarity, yet this effort does not substantially benefit the generator. The retriever and the generator may prefer different golden code, and this gap in preference results in a suboptimal design. Additionally, differences in parameterization knowledge acquired during pre-training result in varying preferences among different generators. To address these limitations, in this paper, we propose RRG (Retrieve, Refactor, Generate), a novel framework for effective and efficient code generation. This framework introduces a code refactorer module between the retriever and the generator to bridge them. The refactoring process transforms the raw retrieved code into a more concise, efficient, and model-friendly version. It eliminates redundant information and noise, reducing the input length. Consequently, the generator receives higher-quality context, enabling it to produce more accurate results with lower inference costs. We conducted comprehensive experiments on multiple datasets. In the experiments, we confirmed the existence of a preference gap between the retriever and the generator, and RRG effectively bridges this gap. Specifically, RRG achieved significant performance improvements, with increases of up to 28% on EM, 13% on BLEU, and 6.8% on CodeBLEU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15895v1-abstract-full').style.display = 'none'; document.getElementById('2409.15895v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
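<p class="is-size-7">The retrieve-refactor-generate control flow described above can be sketched as a thin pipeline in which a refactorer sits between the retriever and the generator. The three callables below are hypothetical stand-ins (a real system would back them with a code retriever and two language models), so only the wiring, not the models, is illustrated.</p>
<pre><code class="language-python">
# Minimal sketch of the retrieve -> refactor -> generate wiring; all three
# stages are hypothetical placeholders, not RRG's actual components.
from typing import Callable, List


def rrg_generate(query,
                 retrieve: Callable[[str, int], List[str]],
                 refactor: Callable[[str, List[str]], str],
                 generate: Callable[[str, str], str],
                 top_k=5):
    """Retrieve raw code snippets, refactor them into a concise,
    generator-friendly context, then condition the generator on it."""
    raw_snippets = retrieve(query, top_k)      # possibly noisy or redundant
    context = refactor(query, raw_snippets)    # shorter, model-friendly
    return generate(query, context)


if __name__ == "__main__":
    # Toy stand-ins so the wiring runs end to end.
    corpus = ["def add(a, b): return a + b", "def sub(a, b): return a - b"]
    retrieve = lambda q, k: corpus[:k]
    refactor = lambda q, snips: "\n".join(s for s in snips if q.split()[0] in s)
    generate = lambda q, ctx: f"# task: {q}\n{ctx}"
    print(rrg_generate("add two numbers", retrieve, refactor, generate))
</code></pre>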
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ASE2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10966">arXiv:2409.10966</a> <span> [<a href="https://arxiv.org/pdf/2409.10966">pdf</a>, <a href="https://arxiv.org/format/2409.10966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CUNSB-RFIE: Context-aware Unpaired Neural Schr枚dinger Bridge in Retinal Fundus Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xuanzhao Dong</a>, <a href="/search/cs?searchtype=author&query=Vasa%2C+V+K">Vamsi Krishna Vasa</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+P">Peijie Qiu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiwen Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yujian Xiong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhangsihao Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanxi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yalin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10966v1-abstract-short" style="display: inline;"> Retinal fundus photography is significant in diagnosing and monitoring retinal diseases. However, systemic imperfections and operator/patient-related factors can hinder the acquisition of high-quality retinal images. Previous efforts in retinal image enhancement primarily relied on GANs, which are limited by the trade-off between training stability and output diversity. In contrast, the Schr枚dinge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10966v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10966v1-abstract-full" style="display: none;"> Retinal fundus photography is significant in diagnosing and monitoring retinal diseases. However, systemic imperfections and operator/patient-related factors can hinder the acquisition of high-quality retinal images. Previous efforts in retinal image enhancement primarily relied on GANs, which are limited by the trade-off between training stability and output diversity. In contrast, the Schr枚dinger Bridge (SB), offers a more stable solution by utilizing Optimal Transport (OT) theory to model a stochastic differential equation (SDE) between two arbitrary distributions. This allows SB to effectively transform low-quality retinal images into their high-quality counterparts. In this work, we leverage the SB framework to propose an image-to-image translation pipeline for retinal image enhancement. Additionally, previous methods often fail to capture fine structural details, such as blood vessels. 
To address this, we enhance our pipeline by introducing Dynamic Snake Convolution, whose tortuous receptive field can better preserve tubular structures. We name the resulting retinal fundus image enhancement framework the Context-aware Unpaired Neural Schrödinger Bridge (CUNSB-RFIE). To the best of our knowledge, this is the first endeavor to use the SB approach for retinal image enhancement. Experimental results on a large-scale dataset demonstrate the advantage of the proposed method compared to several state-of-the-art supervised and unsupervised methods in terms of image quality and performance on downstream tasks. The code is available at https://github.com/Retinal-Research/CUNSB-RFIE . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10966v1-abstract-full').style.display = 'none'; document.getElementById('2409.10966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07862">arXiv:2409.07862</a> <span> [<a href="https://arxiv.org/pdf/2409.07862">pdf</a>, <a href="https://arxiv.org/format/2409.07862">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Context-Aware Optimal Transport Learning for Retinal Fundus Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vasa%2C+V+K">Vamsi Krishna Vasa</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+P">Peijie Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yujian Xiong</a>, <a href="/search/cs?searchtype=author&query=Dumitrascu%2C+O">Oana Dumitrascu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yalin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07862v1-abstract-short" style="display: inline;"> Retinal fundus photography offers a non-invasive way to diagnose and monitor a variety of retinal diseases, but is prone to inherent quality glitches arising from systemic imperfections or operator/patient-related factors. However, high-quality retinal images are crucial for carrying out accurate diagnoses and automated analyses.
The fundus image enhancement is typically formulated as a distributi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07862v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07862v1-abstract-full" style="display: none;"> Retinal fundus photography offers a non-invasive way to diagnose and monitor a variety of retinal diseases, but is prone to inherent quality glitches arising from systemic imperfections or operator/patient-related factors. However, high-quality retinal images are crucial for carrying out accurate diagnoses and automated analyses. The fundus image enhancement is typically formulated as a distribution alignment problem, by finding a one-to-one mapping between a low-quality image and its high-quality counterpart. This paper proposes a context-informed optimal transport (OT) learning framework for tackling unpaired fundus image enhancement. In contrast to standard generative image enhancement methods, which struggle with handling contextual information (e.g., over-tampered local structures and unwanted artifacts), the proposed context-aware OT learning paradigm better preserves local structures and minimizes unwanted artifacts. Leveraging deep contextual features, we derive the proposed context-aware OT using the earth mover's distance and show that the proposed context-OT has a solid theoretical guarantee. Experimental results on a large-scale dataset demonstrate the superiority of the proposed method over several state-of-the-art supervised and unsupervised methods in terms of signal-to-noise ratio, structural similarity index, as well as two downstream tasks. The code is available at \url{https://github.com/Retinal-Research/Contextual-OT}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07862v1-abstract-full').style.display = 'none'; document.getElementById('2409.07862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
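<p class="is-size-7">For intuition about the optimal-transport alignment underlying the method above, the following NumPy sketch computes an entropic-OT (Sinkhorn) plan between two small sets of feature vectors. It is a generic illustration under assumed toy features, not the paper's context-aware earth mover's distance formulation.</p>
<pre><code class="language-python">
# Tiny entropic-OT (Sinkhorn) illustration of aligning two feature sets;
# generic sketch for intuition only, not the paper's loss or features.
import numpy as np


def sinkhorn_plan(x, y, epsilon=0.1, n_iters=200):
    """Entropic-regularized OT plan between uniform distributions supported
    on feature sets x (n, d) and y (m, d), with squared Euclidean cost."""
    n, m = x.shape[0], y.shape[0]
    cost = ((x[:, None, :] - y[None, :, :]) ** 2).sum(axis=2)
    kernel = np.exp(-cost / epsilon)
    a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)
    u, v = np.ones(n), np.ones(m)
    for _ in range(n_iters):                  # alternating Sinkhorn scalings
        u = a / (kernel @ v)
        v = b / (kernel.T @ u)
    plan = u[:, None] * kernel * v[None, :]
    return plan, float((plan * cost).sum())   # transport plan and OT cost


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    low_q = rng.normal(size=(6, 8))           # stand-in "low-quality" features
    high_q = low_q + 0.05 * rng.normal(size=(6, 8))
    plan, ot_cost = sinkhorn_plan(low_q, high_q)
    print(plan.shape, round(ot_cost, 4))      # rows/cols each sum to ~1/6
</code></pre>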
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03986">arXiv:2409.03986</a> <span> [<a href="https://arxiv.org/pdf/2409.03986">pdf</a>, <a href="https://arxiv.org/format/2409.03986">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> An Efficient and Generalizable Symbolic Regression Method for Time Series Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yi Xie</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+T">Tianyu Qiu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiuqi Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xiaofeng Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03986v1-abstract-short" style="display: inline;"> Time series analysis and prediction methods currently excel in quantitative analysis, offering accurate future predictions and diverse statistical indicators, but generally falling short in elucidating the underlying evolution patterns of time series. To gain a more comprehensive understanding and provide insightful explanations, we utilize symbolic regression techniques to derive explicit express… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03986v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03986v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03986v1-abstract-full" style="display: none;"> Time series analysis and prediction methods currently excel in quantitative analysis, offering accurate future predictions and diverse statistical indicators, but generally falling short in elucidating the underlying evolution patterns of time series. To gain a more comprehensive understanding and provide insightful explanations, we utilize symbolic regression techniques to derive explicit expressions for the non-linear dynamics in the evolution of time series variables. However, these techniques face challenges in computational efficiency and generalizability across diverse real-world time series data. To overcome these challenges, we propose \textbf{N}eural-\textbf{E}nhanced \textbf{Mo}nte-Carlo \textbf{T}ree \textbf{S}earch (NEMoTS) for time series. NEMoTS leverages the exploration-exploitation balance of Monte-Carlo Tree Search (MCTS), significantly reducing the search space in symbolic regression and improving expression quality. Furthermore, by integrating neural networks with MCTS, NEMoTS not only capitalizes on their superior fitting capabilities to concentrate on more pertinent operations post-search space reduction, but also replaces the complex and time-consuming simulation process, thereby substantially improving computational efficiency and generalizability in time series analysis. NEMoTS offers an efficient and comprehensive approach to time series analysis. 
Experiments with three real-world datasets demonstrate NEMoTS's significant superiority in performance, efficiency, reliability, and interpretability, making it well-suited for large-scale real-world time series data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03986v1-abstract-full').style.display = 'none'; document.getElementById('2409.03986v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03561">arXiv:2409.03561</a> <span> [<a href="https://arxiv.org/pdf/2409.03561">pdf</a>, <a href="https://arxiv.org/format/2409.03561">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Communication-Assisted Sensing Systems: Fundamental Limits and ISAC Waveform Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+F">Fuwang Dong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yifeng Xiong</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuanhao Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03561v1-abstract-short" style="display: inline;"> The communication-assisted sensing (CAS) systems are expected to endow the users with beyond-line-of-sight sensing capabilities without the aid of additional sensors. In this paper, we study the dual-functional signaling strategy, focusing on three primary aspects, namely, the information-theoretic framework, the optimal distribution of channel input, and the optimal waveform design for Gaussian s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03561v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03561v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03561v1-abstract-full" style="display: none;"> The communication-assisted sensing (CAS) systems are expected to endow the users with beyond-line-of-sight sensing capabilities without the aid of additional sensors. In this paper, we study the dual-functional signaling strategy, focusing on three primary aspects, namely, the information-theoretic framework, the optimal distribution of channel input, and the optimal waveform design for Gaussian signals. First, we establish the information-theoretic framework and develop a modified source-channel separation theorem (MSST) tailored for CAS systems. The proposed MSST elucidates the relationship between achievable distortion, coding rate, and communication channel capacity in cases where the distortion metric is separable for sensing and communication (S\&C) processes. 
Second, we present an optimal channel input design for dual-functional signaling, which aims to minimize total distortion under the constraints of the MSST and resource budget. We then conceive a two-step Blahut-Arimoto (BA)-based optimal search algorithm to numerically solve the functional optimization problem. Third, in light of the current signaling strategy, we further propose an optimal waveform design for Gaussian signaling in multi-input multi-output (MIMO) CAS systems. The associated covariance matrix optimization problem is addressed using a successive convex approximation (SCA)-based waveform design algorithm. Finally, we provide numerical simulation results to demonstrate the effectiveness of the proposed algorithms and to show the unique performance tradeoff between S\&C processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03561v1-abstract-full').style.display = 'none'; document.getElementById('2409.03561v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00740">arXiv:2409.00740</a> <span> [<a href="https://arxiv.org/pdf/2409.00740">pdf</a>, <a href="https://arxiv.org/format/2409.00740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> VPVet: Vetting Privacy Policies of Virtual Reality Apps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yuxia Zhan</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+Y">Yan Meng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Lu Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yichang Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lichuan Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guoxing Chen</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+Q">Qingqi Pei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haojin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00740v1-abstract-short" style="display: inline;"> Virtual reality (VR) apps can harvest a wider range of user data than web/mobile apps running on personal computers or smartphones. Existing law and privacy regulations emphasize that VR developers should inform users of what data are collected/used/shared (CUS) through privacy policies. 
However, privacy policies in the VR ecosystem are still in their early stages, and many developers fail to writ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00740v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00740v1-abstract-full" style="display: none;"> Virtual reality (VR) apps can harvest a wider range of user data than web/mobile apps running on personal computers or smartphones. Existing law and privacy regulations emphasize that VR developers should inform users of what data are collected/used/shared (CUS) through privacy policies. However, privacy policies in the VR ecosystem are still in their early stages, and many developers fail to write appropriate privacy policies that comply with regulations and meet user expectations. In this paper, we propose VPVet to automatically vet privacy policy compliance issues for VR apps. VPVet first analyzes the availability and completeness of a VR privacy policy and then refines its analysis based on three key criteria: granularity, minimization, and consistency of CUS statements. Our study establishes the first and currently largest VR privacy policy dataset named VRPP, consisting of privacy policies of 11,923 different VR apps from 10 mainstream platforms. Our vetting results reveal severe privacy issues within the VR ecosystem, including the limited availability and poor quality of privacy policies, along with their coarse granularity, lack of adaptation to VR traits and the inconsistency between CUS statements in privacy policies and their actual behaviors. We open-source VPVet system along with our findings at repository https://github.com/kalamoo/PPAudit, aiming to raise awareness within the VR community and pave the way for further research in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00740v1-abstract-full').style.display = 'none'; document.getElementById('2409.00740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 13 figures (including subfigures), 13 tables. 
To appear on ACM CCS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16220">arXiv:2408.16220</a> <span> [<a href="https://arxiv.org/pdf/2408.16220">pdf</a>, <a href="https://arxiv.org/format/2408.16220">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> LightSLH: Provable and Low-Overhead Spectre v1 Mitigation through Targeted Instruction Hardening </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiming Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenchao Huang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16220v1-abstract-short" style="display: inline;"> Several software mitigations have been proposed to defend against Spectre vulnerabilities. However, these countermeasures often suffer from high performance overhead, largely due to unnecessary protections. We propose LightSLH, designed to mitigate this overhead by hardening instructions only when they are under threat from Spectre vulnerabilities. LightSLH leverages program analysis techniques ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16220v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16220v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16220v1-abstract-full" style="display: none;"> Several software mitigations have been proposed to defend against Spectre vulnerabilities. However, these countermeasures often suffer from high performance overhead, largely due to unnecessary protections. We propose LightSLH, designed to mitigate this overhead by hardening instructions only when they are under threat from Spectre vulnerabilities. LightSLH leverages program analysis techniques based on abstract interpretation to identify all instructions that could potentially lead to Spectre vulnerabilities and provides provable protection. To enhance analysis efficiency and precision, LightSLH employs novel taint and value domains. The taint domain enables bit-level taint tracking, while the value domain allows LightSLH to analyze complex program structures such as pointers and structures. Furthermore, LightSLH uses a two-stage abstract interpretation approach to circumvent potential analysis paralysis issues. We demonstrate the security guarantees of LightSLH and evaluate its performance on cryptographic algorithm implementations from OpenSSL. LightSLH significantly reduces the overhead associated with speculative-load-hardening techniques. Our results show that LightSLH introduces no protection and thus no overhead on 4 out of the 7 studied algorithms, which contrasts with existing countermeasures that introduce additional overhead due to unnecessary hardening. 
Additionally, LightSLH performs, for the first time, a rigorous analysis of the security guarantees of RSA against Spectre v1, highlighting that the memory access patterns generated by the scatter-gather algorithm depend on secrets, even for observers at the cache line granularity, necessitating protection for such accesses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16220v1-abstract-full').style.display = 'none'; document.getElementById('2408.16220v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15997">arXiv:2408.15997</a> <span> [<a href="https://arxiv.org/pdf/2408.15997">pdf</a>, <a href="https://arxiv.org/format/2408.15997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Mamba or Transformer for Time Series Forecasting? Mixture of Universals (MoU) Is All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+S">Sijia Peng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yangyong Zhu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhiqiang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15997v1-abstract-short" style="display: inline;"> Time series forecasting requires balancing short-term and long-term dependencies for accurate predictions. Existing methods mainly focus on long-term dependency modeling, neglecting the complexities of short-term dynamics, which may hinder performance. Transformers are superior in modeling long-term dependencies but are criticized for their quadratic computational cost. Mamba provides a near-linea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15997v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15997v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15997v1-abstract-full" style="display: none;"> Time series forecasting requires balancing short-term and long-term dependencies for accurate predictions. Existing methods mainly focus on long-term dependency modeling, neglecting the complexities of short-term dynamics, which may hinder performance. Transformers are superior in modeling long-term dependencies but are criticized for their quadratic computational cost. Mamba provides a near-linear alternative but is reported less effective in time series long-term forecasting due to potential information loss. Current architectures fall short in offering both high efficiency and strong performance for long-term dependency modeling. 
To address these challenges, we introduce Mixture of Universals (MoU), a versatile model to capture both short-term and long-term dependencies for enhancing performance in time series forecasting. MoU is composed of two novel designs: Mixture of Feature Extractors (MoF), an adaptive method designed to improve time series patch representations for short-term dependency, and Mixture of Architectures (MoA), which hierarchically integrates Mamba, FeedForward, Convolution, and Self-Attention architectures in a specialized order to model long-term dependency from a hybrid perspective. The proposed approach achieves state-of-the-art performance while maintaining relatively low computational costs. Extensive experiments on seven real-world datasets demonstrate the superiority of MoU. Code is available at https://github.com/lunaaa95/mou/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15997v1-abstract-full').style.display = 'none'; document.getElementById('2408.15997v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code at https://github.com/lunaaa95/mou/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14158">arXiv:2408.14158</a> <span> [<a href="https://arxiv.org/pdf/2408.14158">pdf</a>, <a href="https://arxiv.org/format/2408.14158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fire-Flyer AI-HPC: A Cost-Effective Software-Hardware Co-Design for Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+W">Wei An</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shanhuang Chen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Honghui Ding</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+K">Kai Dong</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Q">Qiushi Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+W">Wenjun Gao</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+K">Kang Guan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jianzhong Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yongqiang Guo</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zhe Fu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Ying He</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Panpan Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiashi Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+W">Wenfeng Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaodong Liu</a>, <a 
href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuxuan Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shanghao Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xuan Lu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+X">Xiaotao Nie</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+T">Tian Pei</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14158v2-abstract-short" style="display: inline;"> The rapid progress in Deep Learning (DL) and Large Language Models (LLMs) has exponentially increased demands of computational power and bandwidth. This, combined with the high costs of faster computing chips and interconnects, has significantly inflated High Performance Computing (HPC) construction costs. To address these challenges, we introduce the Fire-Flyer AI-HPC architecture, a synergistic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14158v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14158v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14158v2-abstract-full" style="display: none;"> The rapid progress in Deep Learning (DL) and Large Language Models (LLMs) has exponentially increased demands of computational power and bandwidth. This, combined with the high costs of faster computing chips and interconnects, has significantly inflated High Performance Computing (HPC) construction costs. To address these challenges, we introduce the Fire-Flyer AI-HPC architecture, a synergistic hardware-software co-design framework and its best practices. For DL training, we deployed the Fire-Flyer 2 with 10,000 PCIe A100 GPUs, achieved performance approximating the DGX-A100 while reducing costs by half and energy consumption by 40%. We specifically engineered HFReduce to accelerate allreduce communication and implemented numerous measures to keep our Computation-Storage Integrated Network congestion-free. Through our software stack, including HaiScale, 3FS, and HAI-Platform, we achieved substantial scalability by overlapping computation and communication. Our system-oriented experience from DL training provides valuable insights to drive future advancements in AI-HPC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14158v2-abstract-full').style.display = 'none'; document.getElementById('2408.14158v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is the preprint version of the paper accepted for presentation at the 2024 International Conference for High Performance Computing, Networking, Storage, and Analysis (SC'24). \c{opyright} 2024 IEEE. Personal use of this material is permitted. 
For other uses, permission from IEEE must be obtained. Please refer to IEEE Xplore for the final published version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11208">arXiv:2408.11208</a> <span> [<a href="https://arxiv.org/pdf/2408.11208">pdf</a>, <a href="https://arxiv.org/format/2408.11208">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PooDLe: Pooled and dense self-supervised learning from naturalistic videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+A+N">Alex N. Wang</a>, <a href="/search/cs?searchtype=author&query=Hoang%2C+C">Christopher Hoang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuwen Xiong</a>, <a href="/search/cs?searchtype=author&query=LeCun%2C+Y">Yann LeCun</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11208v1-abstract-short" style="display: inline;"> Self-supervised learning has driven significant progress in learning from single-subject, iconic images. However, there are still unanswered questions about the use of minimally-curated, naturalistic video data, which contain dense scenes with many independent objects, imbalanced class distributions, and varying object sizes. In this paper, we propose a novel approach that combines an invariance-b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11208v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11208v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11208v1-abstract-full" style="display: none;"> Self-supervised learning has driven significant progress in learning from single-subject, iconic images. However, there are still unanswered questions about the use of minimally-curated, naturalistic video data, which contain dense scenes with many independent objects, imbalanced class distributions, and varying object sizes. In this paper, we propose a novel approach that combines an invariance-based SSL objective on pooled representations with a dense SSL objective that enforces equivariance to optical flow warping. Our findings indicate that a unified objective applied at multiple feature scales is essential for learning effective image representations from high-resolution, naturalistic videos. We validate our approach on the BDD100K driving video dataset and the Walking Tours first-person video dataset, demonstrating its ability to capture spatial understanding from a dense objective and semantic understanding via a pooled representation objective. 
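<p class="is-size-7">A minimal sketch of how a pooled (invariance) objective and a dense (per-location) objective can be combined, in the spirit of the PooDLe description above; optical-flow warping is omitted and the feature maps, shapes, and weighting are illustrative assumptions rather than the authors' implementation.</p> <pre><code class="language-python">
# Combine a pooled invariance loss with a dense per-location loss on two
# feature maps from augmented views (assumed already spatially aligned).
import numpy as np

def cosine(a, b, axis=-1, eps=1e-8):
    num = (a * b).sum(axis=axis)
    den = np.linalg.norm(a, axis=axis) * np.linalg.norm(b, axis=axis) + eps
    return num / den

def pooled_plus_dense_loss(feat_a, feat_b, w_dense=1.0):
    """feat_a, feat_b: arrays of shape (H, W, C) from two views."""
    pooled_a, pooled_b = feat_a.mean(axis=(0, 1)), feat_b.mean(axis=(0, 1))
    pooled_loss = 1.0 - cosine(pooled_a, pooled_b)          # global, semantic term
    dense_loss = (1.0 - cosine(feat_a, feat_b)).mean()      # per-location, spatial term
    return pooled_loss + w_dense * dense_loss

rng = np.random.default_rng(0)
fa = rng.normal(size=(8, 8, 32))
fb = fa + 0.1 * rng.normal(size=(8, 8, 32))   # slightly perturbed second view
print(round(pooled_plus_dense_loss(fa, fb), 4))
</code></pre>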
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11208v1-abstract-full').style.display = 'none'; document.getElementById('2408.11208v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://poodle-ssl.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07369">arXiv:2408.07369</a> <span> [<a href="https://arxiv.org/pdf/2408.07369">pdf</a>, <a href="https://arxiv.org/format/2408.07369">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> ProCom: A Few-shot Targeted Community Detection Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xixi Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+K">Kaiyu Xiong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxin He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+Y">Yizhu Jiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07369v1-abstract-short" style="display: inline;"> Targeted community detection aims to distinguish a particular type of community in the network. This is an important task with a lot of real-world applications, e.g., identifying fraud groups in transaction networks. Traditional community detection methods fail to capture the specific features of the targeted community and detect all types of communities indiscriminately. Semi-supervised community… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07369v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07369v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07369v1-abstract-full" style="display: none;"> Targeted community detection aims to distinguish a particular type of community in the network. This is an important task with a lot of real-world applications, e.g., identifying fraud groups in transaction networks. Traditional community detection methods fail to capture the specific features of the targeted community and detect all types of communities indiscriminately. Semi-supervised community detection algorithms, emerged as a feasible alternative, are inherently constrained by their limited adaptability and substantial reliance on a large amount of labeled data, which demands extensive domain knowledge and manual effort. In this paper, we address the aforementioned weaknesses in targeted community detection by focusing on few-shot scenarios. 
We propose ProCom, a novel framework that extends the ``pre-train, prompt'' paradigm, offering a low-resource, high-efficiency, and transferable solution. Within the framework, we devise a dual-level context-aware pre-training method that fosters a deep understanding of latent communities in the network, establishing a rich knowledge foundation for downstream task. In the prompt learning stage, we reformulate the targeted community detection task into pre-training objectives, allowing the extraction of specific knowledge relevant to the targeted community to facilitate effective and efficient inference. By leveraging both the general community knowledge acquired during pre-training and the specific insights gained from the prompt communities, ProCom exhibits remarkable adaptability across different datasets. We conduct extensive experiments on five benchmarks to evaluate the ProCom framework, demonstrating its SOTA performance under few-shot scenarios, strong efficiency, and transferability across diverse datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07369v1-abstract-full').style.display = 'none'; document.getElementById('2408.07369v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGKDD'2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06854">arXiv:2408.06854</a> <span> [<a href="https://arxiv.org/pdf/2408.06854">pdf</a>, <a href="https://arxiv.org/format/2408.06854">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LoRA$^2$ : Multi-Scale Low-Rank Approximations for Fine-Tuning Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jia-Chen Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yu-Jie Xiong</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+H">He-Xi Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dong-Hai Zhu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+C">Chun-Ming Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06854v1-abstract-short" style="display: inline;"> Fine-tuning large language models (LLMs) with high parameter efficiency for downstream tasks has become a new paradigm. Low-Rank Adaptation (LoRA) significantly reduces the number of trainable parameters for fine-tuning. 
Although it has demonstrated commendable performance, updating parameters within a single scale may not be the optimal choice for complex downstream tasks. In this paper, we extend… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06854v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06854v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06854v1-abstract-full" style="display: none;"> Fine-tuning large language models (LLMs) with high parameter efficiency for downstream tasks has become a new paradigm. Low-Rank Adaptation (LoRA) significantly reduces the number of trainable parameters for fine-tuning. Although it has demonstrated commendable performance, updating parameters within a single scale may not be the optimal choice for complex downstream tasks. In this paper, we extend LoRA to multiple scales, dubbed LoRA$^2$. We first apply orthogonal projection theory to train a set of LoRAs in two mutually orthogonal planes. Then, we improve the importance score algorithm, which reduces parameter sensitivity score calculations by approximately 98.5\%. By pruning singular values with lower importance scores, we enhance adaptability to various downstream tasks. Extensive experiments are conducted on two widely used pre-trained models to validate the effectiveness of LoRA$^2$. Results show that it significantly reduces the number of trainable parameters to just 0.72\% compared to full fine-tuning, while still delivering highly impressive performance. Even when the parameters are further reduced to 0.17M, it still achieves comparable results to the baseline with 8 times more parameters. Our code is available here: https://anonymous.4open.science/r/LoRA-2-5B4C <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06854v1-abstract-full').style.display = 'none'; document.getElementById('2408.06854v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
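<p class="is-size-7">A minimal sketch of the basic low-rank adaptation update that LoRA$^2$ extends: a frozen weight is corrected by a trainable rank-r product. The two mutually orthogonal scales and the importance-score pruning described above are not reproduced; all names and shapes are illustrative.</p> <pre><code class="language-python">
# Single low-rank (LoRA-style) adapter: y = W x + B(A x), with W frozen.
import numpy as np

d_out, d_in, rank = 64, 64, 4
rng = np.random.default_rng(0)

W = rng.normal(size=(d_out, d_in))         # frozen pre-trained weight
A = 0.01 * rng.normal(size=(rank, d_in))   # trainable down-projection
B = np.zeros((d_out, rank))                # trainable up-projection, zero-initialized

def adapted_forward(x):
    """x: (d_in,) input; returns the frozen output plus the low-rank correction."""
    return W @ x + B @ (A @ x)

x = rng.normal(size=(d_in,))
print(adapted_forward(x).shape)   # (64,)
</code></pre>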
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21059">arXiv:2407.21059</a> <span> [<a href="https://arxiv.org/pdf/2407.21059">pdf</a>, <a href="https://arxiv.org/format/2407.21059">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Modular RAG: Transforming RAG Systems into LEGO-like Reconfigurable Frameworks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yunfan Gao</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21059v1-abstract-short" style="display: inline;"> Retrieval-augmented Generation (RAG) has markedly enhanced the capabilities of Large Language Models (LLMs) in tackling knowledge-intensive tasks. The increasing demands of application scenarios have driven the evolution of RAG, leading to the integration of advanced retrievers, LLMs and other complementary technologies, which in turn has amplified the intricacy of RAG systems. However, the rapid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21059v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21059v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21059v1-abstract-full" style="display: none;"> Retrieval-augmented Generation (RAG) has markedly enhanced the capabilities of Large Language Models (LLMs) in tackling knowledge-intensive tasks. The increasing demands of application scenarios have driven the evolution of RAG, leading to the integration of advanced retrievers, LLMs and other complementary technologies, which in turn has amplified the intricacy of RAG systems. However, the rapid advancements are outpacing the foundational RAG paradigm, with many methods struggling to be unified under the process of "retrieve-then-generate". In this context, this paper examines the limitations of the existing RAG paradigm and introduces the modular RAG framework. By decomposing complex RAG systems into independent modules and specialized operators, it facilitates a highly reconfigurable framework. Modular RAG transcends the traditional linear architecture, embracing a more advanced design that integrates routing, scheduling, and fusion mechanisms. Drawing on extensive research, this paper further identifies prevalent RAG patterns-linear, conditional, branching, and looping-and offers a comprehensive analysis of their respective implementation nuances. Modular RAG presents innovative opportunities for the conceptualization and deployment of RAG systems. 
Finally, the paper explores the potential emergence of new operators and paradigms, establishing a solid theoretical foundation and a practical roadmap for the continued evolution and practical deployment of RAG technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21059v1-abstract-full').style.display = 'none'; document.getElementById('2407.21059v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19826">arXiv:2407.19826</a> <span> [<a href="https://arxiv.org/pdf/2407.19826">pdf</a>, <a href="https://arxiv.org/format/2407.19826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Design and Control of a Novel Six-Degree-of-Freedom Hybrid Robotic Arm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Z">Zhonghua Miao</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yuanyue Ge</a>, <a href="/search/cs?searchtype=author&query=lin%2C+S">Sen lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liping Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Ya Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19826v1-abstract-short" style="display: inline;"> Robotic arms are key components in fruit-harvesting robots. In agricultural settings, conventional serial or parallel robotic arms often fall short in meeting the demands for a large workspace, rapid movement, enhanced capability of obstacle avoidance and affordability. This study proposes a novel hybrid six-degree-of-freedom (DoF) robotic arm that combines the advantages of parallel and serial me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19826v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19826v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19826v1-abstract-full" style="display: none;"> Robotic arms are key components in fruit-harvesting robots. In agricultural settings, conventional serial or parallel robotic arms often fall short in meeting the demands for a large workspace, rapid movement, enhanced capability of obstacle avoidance and affordability. This study proposes a novel hybrid six-degree-of-freedom (DoF) robotic arm that combines the advantages of parallel and serial mechanisms. Inspired by yoga, we designed two sliders capable of moving independently along a single rail, acting as two feet. These sliders are interconnected with linkages and a meshed-gear set, allowing the parallel mechanism to lower itself and perform a split to pass under obstacles. This unique feature allows the arm to avoid obstacles such as pipes, tables and beams typically found in greenhouses. 
Integrated with serially mounted joints, the patented hybrid arm is able to maintain the end's pose even when it moves with a mobile platform, facilitating fruit picking with the optimal pose in dynamic conditions. Moreover, the hybrid arm's workspace is substantially larger, being almost three times the volume of UR3 serial arms and fourteen times that of the ABB IRB parallel arms. Experiments show that the repeatability errors are 0.017 mm, 0.03 mm and 0.109 mm for the two sliders and the arm's end, respectively, providing sufficient precision for agricultural robots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19826v1-abstract-full').style.display = 'none'; document.getElementById('2407.19826v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IROS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18877">arXiv:2407.18877</a> <span> [<a href="https://arxiv.org/pdf/2407.18877">pdf</a>, <a href="https://arxiv.org/format/2407.18877">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Line-level Semantic Structure Learning for Code Vulnerability Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziliang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yihong Dong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yingfei Xiong</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18877v2-abstract-short" style="display: inline;"> Unlike the flow structure of natural languages, programming languages have an inherent rigidity in structure and grammar. However, existing detection methods based on pre-trained models typically treat code as a natural language sequence, ignoring its unique structural information. This hinders the models from understanding the code's semantic and structural information. To address this problem, we i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18877v2-abstract-full').style.display = 'inline'; document.getElementById('2407.18877v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18877v2-abstract-full" style="display: none;"> Unlike the flow structure of natural languages, programming languages have an inherent rigidity in structure and grammar. However, existing detection methods based on pre-trained models typically treat code as a natural language sequence, ignoring its unique structural information. 
This hinders the models from understanding the code's semantic and structural information. To address this problem, we introduce the Code Structure-Aware Network through Line-level Semantic Learning (CSLS), which comprises four components: code preprocessing, global semantic awareness, line semantic awareness, and line semantic structure awareness. The preprocessing step transforms the code into two types of text: global code text and line-level code text. Unlike typical preprocessing methods, CSLS retains structural elements such as newlines and indent characters to enhance the model's perception of code lines during global semantic awareness. For line semantic structure awareness, the CSLS network emphasizes capturing structural relationships between line semantics. Different from the structural modeling methods based on code blocks (control flow graphs) or tokens, CSLS uses line semantics as the minimum structural unit to learn nonlinear structural relationships, thereby improving the accuracy of code vulnerability detection. We conducted extensive experiments on vulnerability detection datasets from real projects. The CSLS model outperforms the state-of-the-art baselines in code vulnerability detection, achieving 70.57% accuracy on the Devign dataset and a 49.59% F1 score on the Reveal dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18877v2-abstract-full').style.display = 'none'; document.getElementById('2407.18877v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
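<p class="is-size-7">A minimal sketch of the preprocessing step described above, which keeps a global code text with newlines and indentation intact alongside per-line texts; the function and example are illustrative assumptions, not the CSLS code.</p> <pre><code class="language-python">
# Produce the two text views used downstream: global code text (structure
# preserved) and a list of line-level texts.
def preprocess_code(source: str):
    global_text = source                # newlines and indent characters retained
    line_texts = source.splitlines()    # one entry per code line
    return global_text, line_texts

example = "def add(a, b):\n    total = a + b\n    return total\n"
global_text, line_texts = preprocess_code(example)
print(len(line_texts), repr(line_texts[1]))
</code></pre>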
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18523">arXiv:2407.18523</a> <span> [<a href="https://arxiv.org/pdf/2407.18523">pdf</a>, <a href="https://arxiv.org/format/2407.18523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DTFormer: A Transformer-Based Method for Discrete-Time Dynamic Graph Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shiyang Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xixi Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tengfei Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiqiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18523v1-abstract-short" style="display: inline;"> Discrete-Time Dynamic Graphs (DTDGs), which are prevalent in real-world implementations and notable for their ease of data acquisition, have garnered considerable attention from both academic researchers and industry practitioners. The representation learning of DTDGs has been extensively applied to model the dynamics of temporally changing entities and their evolving connections. Currently, DTDG… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18523v1-abstract-full').style.display = 'inline'; document.getElementById('2407.18523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18523v1-abstract-full" style="display: none;"> Discrete-Time Dynamic Graphs (DTDGs), which are prevalent in real-world implementations and notable for their ease of data acquisition, have garnered considerable attention from both academic researchers and industry practitioners. The representation learning of DTDGs has been extensively applied to model the dynamics of temporally changing entities and their evolving connections. Currently, DTDG representation learning predominantly relies on GNN+RNN architectures, which manifest the inherent limitations of both Graph Neural Networks (GNNs) and Recurrent Neural Networks (RNNs). GNNs suffer from the over-smoothing issue as the models architecture goes deeper, while RNNs struggle to capture long-term dependencies effectively. GNN+RNN architectures also grapple with scaling to large graph sizes and long sequences. 
Additionally, these methods often compute node representations separately and focus solely on individual node characteristics, thereby overlooking the behavior intersections between the two nodes whose link is being predicted, such as instances where the two nodes appear together in the same context or share common neighbors. This paper introduces a novel representation learning method DTFormer for DTDGs, pivoting from the traditional GNN+RNN framework to a Transformer-based architecture. Our approach exploits the attention mechanism to concurrently process topological information within the graph at each timestamp and temporal dynamics of graphs along the timestamps, circumventing the aforementioned fundamental weakness of both GNNs and RNNs. Moreover, we enhance the model's expressive capability by incorporating the intersection relationships among nodes and integrating a multi-patching module. Extensive experiments conducted on six public dynamic graph benchmark datasets confirm our model's efficacy, achieving the SOTA performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18523v1-abstract-full').style.display = 'none'; document.getElementById('2407.18523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16308">arXiv:2407.16308</a> <span> [<a href="https://arxiv.org/pdf/2407.16308">pdf</a>, <a href="https://arxiv.org/format/2407.16308">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> SAFNet: Selective Alignment Fusion Network for Efficient HDR Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingtong Kong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yike Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Hong Gu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinwei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16308v1-abstract-short" style="display: inline;"> Multi-exposure High Dynamic Range (HDR) imaging is a challenging task when facing truncated texture and complex motion. Existing deep learning-based methods have achieved great success by either following the alignment and fusion pipeline or utilizing attention mechanism. However, the large computation cost and inference delay hinder them from deploying on resource limited devices. 
In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16308v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16308v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16308v1-abstract-full" style="display: none;"> Multi-exposure High Dynamic Range (HDR) imaging is a challenging task when facing truncated texture and complex motion. Existing deep learning-based methods have achieved great success by either following the alignment and fusion pipeline or utilizing attention mechanism. However, the large computation cost and inference delay hinder them from deploying on resource limited devices. In this paper, to achieve better efficiency, a novel Selective Alignment Fusion Network (SAFNet) for HDR imaging is proposed. After extracting pyramid features, it jointly refines valuable area masks and cross-exposure motion in selected regions with shared decoders, and then fuses a high quality HDR image in an explicit way. This approach can focus the model on finding valuable regions while estimating their easily detectable and meaningful motion. For further detail enhancement, a lightweight refine module is introduced which enjoys privileges from previous optical flow, selection masks and initial prediction. Moreover, to facilitate learning on samples with large motion, a new window partition cropping method is presented during training. Experiments on public and newly developed challenging datasets show that the proposed SAFNet not only exceeds previous SOTA competitors quantitatively and qualitatively, but also runs an order of magnitude faster. Code and dataset are available at https://github.com/ltkong218/SAFNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16308v1-abstract-full').style.display = 'none'; document.getElementById('2407.16308v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14053">arXiv:2407.14053</a> <span> [<a href="https://arxiv.org/pdf/2407.14053">pdf</a>, <a href="https://arxiv.org/format/2407.14053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DirectL: Efficient Radiance Fields Rendering for 3D Light Field Displays </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zongyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Baolin Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yingde Song</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yongping Xiong</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+L">Lan Yi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaohe Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xunbo Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14053v1-abstract-short" style="display: inline;"> Autostereoscopic display, despite decades of development, has not achieved extensive application, primarily due to the daunting challenge of 3D content creation for non-specialists. The emergence of Radiance Field as an innovative 3D representation has markedly revolutionized the domains of 3D reconstruction and generation. This technology greatly simplifies 3D content creation for common users, b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14053v1-abstract-full').style.display = 'inline'; document.getElementById('2407.14053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.14053v1-abstract-full" style="display: none;"> Autostereoscopic display, despite decades of development, has not achieved extensive application, primarily due to the daunting challenge of 3D content creation for non-specialists. The emergence of Radiance Field as an innovative 3D representation has markedly revolutionized the domains of 3D reconstruction and generation. This technology greatly simplifies 3D content creation for common users, broadening the applicability of Light Field Displays (LFDs). However, the combination of these two fields remains largely unexplored. The standard paradigm to create optimal content for parallax-based light field displays demands rendering at least 45 slightly shifted views preferably at high resolution per frame, a substantial hurdle for real-time rendering. We introduce DirectL, a novel rendering paradigm for Radiance Fields on 3D displays. We thoroughly analyze the interweaved mapping of spatial rays to screen subpixels, precisely determine the light rays entering the human eye, and propose subpixel repurposing to significantly reduce the pixel count required for rendering. 
Tailored for the two predominant radiance fields--Neural Radiance Fields (NeRFs) and 3D Gaussian Splatting (3DGS), we propose corresponding optimized rendering pipelines that directly render the light field images instead of multi-view images. Extensive experiments across various displays and user study demonstrate that DirectL accelerates rendering by up to 40 times compared to the standard paradigm without sacrificing visual quality. Its rendering process-only modification allows seamless integration into subsequent radiance field tasks. Finally, we integrate DirectL into diverse applications, showcasing the stunning visual experiences and the synergy between LFDs and Radiance Fields, which unveils tremendous potential for commercialization applications. \href{direct-l.github.io}{\textbf{Project Homepage} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14053v1-abstract-full').style.display = 'none'; document.getElementById('2407.14053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13976">arXiv:2407.13976</a> <span> [<a href="https://arxiv.org/pdf/2407.13976">pdf</a>, <a href="https://arxiv.org/format/2407.13976">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PlacidDreamer: Advancing Harmony in Text-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shuo Huang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shikun Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zixuan Wang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+X">Xiaoyu Qin</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yanmin Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jia Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13976v1-abstract-short" style="display: inline;"> Recently, text-to-3D generation has attracted significant attention, resulting in notable performance enhancements. Previous methods utilize end-to-end 3D generation models to initialize 3D Gaussians, multi-view diffusion models to enforce multi-view consistency, and text-to-image diffusion models to refine details with score distillation algorithms. 
However, these methods exhibit two limitations.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13976v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13976v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13976v1-abstract-full" style="display: none;"> Recently, text-to-3D generation has attracted significant attention, resulting in notable performance enhancements. Previous methods utilize end-to-end 3D generation models to initialize 3D Gaussians, multi-view diffusion models to enforce multi-view consistency, and text-to-image diffusion models to refine details with score distillation algorithms. However, these methods exhibit two limitations. Firstly, they encounter conflicts in generation directions since different models aim to produce diverse 3D assets. Secondly, the issue of over-saturation in score distillation has not been thoroughly investigated and solved. To address these limitations, we propose PlacidDreamer, a text-to-3D framework that harmonizes initialization, multi-view generation, and text-conditioned generation with a single multi-view diffusion model, while simultaneously employing a novel score distillation algorithm to achieve balanced saturation. To unify the generation direction, we introduce the Latent-Plane module, a training-friendly plug-in extension that enables multi-view diffusion models to provide fast geometry reconstruction for initialization and enhanced multi-view images to personalize the text-to-image diffusion model. To address the over-saturation problem, we propose to view score distillation as a multi-objective optimization problem and introduce the Balanced Score Distillation algorithm, which offers a Pareto Optimal solution that achieves both rich details and balanced saturation. Extensive experiments validate the outstanding capabilities of our PlacidDreamer. The code is available at \url{https://github.com/HansenHuang0823/PlacidDreamer}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13976v1-abstract-full').style.display = 'none'; document.getElementById('2407.13976v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
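<p class="is-size-7">The PlacidDreamer abstract above casts score distillation as a multi-objective optimization problem whose update should be Pareto optimal. As a loose, generic illustration of that idea only (not the paper's actual Balanced Score Distillation; the gradient names are invented here), the classic two-objective min-norm rule blends two conflicting gradients with a closed-form weight:</p>
<pre><code class="language-python">
import numpy as np

def balanced_update(g_detail, g_saturation):
    # Two-objective min-norm combination (the classic MGDA rule for two tasks):
    # pick w in [0, 1] minimizing the norm of w*g1 + (1 - w)*g2, in closed form.
    g1, g2 = g_detail, g_saturation
    diff = g1 - g2
    denom = float(diff @ diff)
    if denom == 0.0:
        w = 0.5
    else:
        w = float((g2 - g1) @ g2) / denom
        w = min(1.0, max(0.0, w))
    return w * g1 + (1.0 - w) * g2

# toy usage with two conflicting objective gradients
print(balanced_update(np.array([1.0, 0.2]), np.array([-0.5, 1.0])))
</code></pre>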
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM Multimedia 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13193">arXiv:2407.13193</a> <span> [<a href="https://arxiv.org/pdf/2407.13193">pdf</a>, <a href="https://arxiv.org/format/2407.13193">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Augmented Generation for Natural Language Processing: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shangyu Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Ying Xiong</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yufei Cui</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haolun Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Can Chen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianming Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xue Liu</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+T">Tei-Wei Kuo</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+N">Nan Guan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+C+J">Chun Jason Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13193v2-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated great success in various fields, benefiting from their huge amount of parameters that store knowledge. However, LLMs still suffer from several key issues, such as hallucination problems, knowledge update issues, and lacking domain-specific expertise. The appearance of retrieval-augmented generation (RAG), which leverages an external knowledge database… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13193v2-abstract-full').style.display = 'inline'; document.getElementById('2407.13193v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13193v2-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated great success in various fields, benefiting from their huge amount of parameters that store knowledge. However, LLMs still suffer from several key issues, such as hallucination problems, knowledge update issues, and lacking domain-specific expertise. The appearance of retrieval-augmented generation (RAG), which leverages an external knowledge database to augment LLMs, makes up those drawbacks of LLMs. This paper reviews all significant techniques of RAG, especially in the retriever and the retrieval fusions. Besides, tutorial codes are provided for implementing the representative techniques in RAG. This paper further discusses the RAG training, including RAG with/without datastore update. Then, we introduce the application of RAG in representative natural language processing tasks and industrial scenarios. 
Finally, this paper discusses the future directions and challenges of RAG for promoting its development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13193v2-abstract-full').style.display = 'none'; document.getElementById('2407.13193v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13168">arXiv:2407.13168</a> <span> [<a href="https://arxiv.org/pdf/2407.13168">pdf</a>, <a href="https://arxiv.org/format/2407.13168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SciCode: A Research Coding Benchmark Curated by Scientists </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+M">Minyang Tian</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Luyu Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S+D">Shizhuo Dylan Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinan Chen</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+C">Cunwei Fan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xuefei Guo</a>, <a href="/search/cs?searchtype=author&query=Haas%2C+R">Roland Haas</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+P">Pan Ji</a>, <a href="/search/cs?searchtype=author&query=Krongchon%2C+K">Kittithat Krongchon</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yao Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shengyan Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Di Luo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yutao Ma</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hao Tong</a>, <a href="/search/cs?searchtype=author&query=Trinh%2C+K">Kha Trinh</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chenyu Tian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bohao Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yanyu Xiong</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengzhu Yin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Minhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Lieret%2C+K">Kilian Lieret</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yanxin Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Genglin Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yufeng Du</a> , et al. 
(5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13168v1-abstract-short" style="display: inline;"> Since language models (LMs) now outperform average humans on many challenging tasks, it has become increasingly difficult to develop challenging, high-quality, and realistic evaluations. We address this issue by examining LMs' capabilities to generate code for solving real scientific research problems. Incorporating input from scientists and AI researchers in 16 diverse natural science sub-fields,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13168v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13168v1-abstract-full" style="display: none;"> Since language models (LMs) now outperform average humans on many challenging tasks, it has become increasingly difficult to develop challenging, high-quality, and realistic evaluations. We address this issue by examining LMs' capabilities to generate code for solving real scientific research problems. Incorporating input from scientists and AI researchers in 16 diverse natural science sub-fields, including mathematics, physics, chemistry, biology, and materials science, we created a scientist-curated coding benchmark, SciCode. The problems in SciCode naturally factorize into multiple subproblems, each involving knowledge recall, reasoning, and code synthesis. In total, SciCode contains 338 subproblems decomposed from 80 challenging main problems. It offers optional descriptions specifying useful scientific background information and scientist-annotated gold-standard solutions and test cases for evaluation. Claude3.5-Sonnet, the best-performing model among those tested, can solve only 4.6% of the problems in the most realistic setting. We believe that SciCode demonstrates both contemporary LMs' progress towards becoming helpful scientific assistants and sheds light on the development and evaluation of scientific AI in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13168v1-abstract-full').style.display = 'none'; document.getElementById('2407.13168v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
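<p class="is-size-7">The SciCode abstract mentions scientist-annotated gold-standard solutions and test cases used to score generated code. A minimal, hypothetical sketch of how such per-subproblem tests could score a candidate solution follows; the dictionary layout and the function name solve are illustrative assumptions, not SciCode's real schema or harness:</p>
<pre><code class="language-python">
# Hypothetical layout of one subproblem with gold test cases.
SUBPROBLEM = {
    "prompt": "Return the n-th Fibonacci number.",
    "tests": [((0,), 0), ((1,), 1), ((10,), 55)],
}

# A model-generated candidate solution, received as a string of code.
CANDIDATE = """
def solve(n):
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a
"""

def score(candidate_code, tests):
    namespace = {}
    exec(candidate_code, namespace)              # run the generated code
    solve = namespace["solve"]
    passed = sum(1 for args, expected in tests if solve(*args) == expected)
    return passed / len(tests)

print(score(CANDIDATE, SUBPROBLEM["tests"]))     # 1.0 when every test passes
</code></pre>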
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 9 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12532">arXiv:2407.12532</a> <span> [<a href="https://arxiv.org/pdf/2407.12532">pdf</a>, <a href="https://arxiv.org/format/2407.12532">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Collaborative Intelligence: Propagating Intentions and Reasoning for Multi-Agent Coordination with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xihe Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X">Xiaoyu Tan</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+C">Chao Qu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yujie Xiong</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinghui Xu</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+W">Wei Chu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuan Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12532v1-abstract-short" style="display: inline;"> Effective collaboration in multi-agent systems requires communicating goals and intentions between agents. Current agent frameworks often suffer from dependencies on single-agent execution and lack robust inter-module communication, frequently leading to suboptimal multi-agent reinforcement learning (MARL) policies and inadequate task coordination. To address these challenges, we present a framewo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12532v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12532v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12532v1-abstract-full" style="display: none;"> Effective collaboration in multi-agent systems requires communicating goals and intentions between agents. Current agent frameworks often suffer from dependencies on single-agent execution and lack robust inter-module communication, frequently leading to suboptimal multi-agent reinforcement learning (MARL) policies and inadequate task coordination. To address these challenges, we present a framework for training large language models (LLMs) as collaborative agents to enable coordinated behaviors in cooperative MARL. Each agent maintains a private intention consisting of its current goal and associated sub-tasks. Agents broadcast their intentions periodically, allowing other agents to infer coordination tasks. A propagation network transforms broadcast intentions into teammate-specific communication messages, sharing relevant goals with designated teammates. The architecture of our framework is structured into planning, grounding, and execution modules. 
During execution, multiple agents interact in a downstream environment and communicate intentions to enable coordinated behaviors. The grounding module dynamically adapts comprehension strategies based on emerging coordination patterns, while feedback from execution agents influences the planning module, enabling the dynamic re-planning of sub-tasks. Results in collaborative environment simulation demonstrate that intention propagation reduces miscoordination errors by aligning sub-task dependencies between agents. Agents learn when to communicate intentions and which teammates require task details, resulting in emergent coordinated behaviors. This demonstrates the efficacy of intention sharing for cooperative multi-agent RL based on LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12532v1-abstract-full').style.display = 'none'; document.getElementById('2407.12532v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10195">arXiv:2407.10195</a> <span> [<a href="https://arxiv.org/pdf/2407.10195">pdf</a>, <a href="https://arxiv.org/format/2407.10195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> V2I-Calib: A Novel Calibration Approach for Collaborative Vehicle and Infrastructure LiDAR Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qu%2C+Q">Qianxin Qu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yijin Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guipeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xin Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xiaohan Gao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xin Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hanyu Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shichun Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guoying Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10195v2-abstract-short" style="display: inline;"> Cooperative LiDAR systems integrating vehicles and road infrastructure, termed V2I calibration, exhibit substantial potential, yet their deployment encounters numerous challenges. A pivotal aspect of ensuring data accuracy and consistency across such systems involves the calibration of LiDAR units across heterogeneous vehicular and infrastructural endpoints.
This necessitates the development of ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10195v2-abstract-full').style.display = 'inline'; document.getElementById('2407.10195v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10195v2-abstract-full" style="display: none;"> Cooperative LiDAR systems integrating vehicles and road infrastructure, termed V2I calibration, exhibit substantial potential, yet their deployment encounters numerous challenges. A pivotal aspect of ensuring data accuracy and consistency across such systems involves the calibration of LiDAR units across heterogeneous vehicular and infrastructural endpoints. This necessitates the development of calibration methods that are both real-time and robust, particularly those that can ensure robust performance in urban canyon scenarios without relying on initial positioning values. Accordingly, this paper introduces a novel approach to V2I calibration, leveraging spatial association information among perceived objects. Central to this method is the innovative Overall Intersection over Union (oIoU) metric, which quantifies the correlation between targets identified by vehicle and infrastructure systems, thereby facilitating the real-time monitoring of calibration results. Our approach involves identifying common targets within the perception results of vehicle and infrastructure LiDAR systems through the construction of an affinity matrix. These common targets then form the basis for the calculation and optimization of extrinsic parameters. Comparative and ablation studies conducted using the DAIR-V2X dataset substantiate the superiority of our approach. For further insights and resources, our project repository is accessible at https://github.com/MassimoQu/v2i-calib. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10195v2-abstract-full').style.display = 'none'; document.getElementById('2407.10195v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
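<p class="is-size-7">The V2I-Calib abstract describes finding common targets across the vehicle and infrastructure perception results through an affinity matrix and then optimizing the extrinsic parameters from those shared targets. The snippet below is a heavily simplified, generic sketch of that kind of pipeline under assumed inputs (object centers rather than boxes, a transform-invariant relational affinity instead of the paper's oIoU, Hungarian matching, and a closed-form Kabsch fit); it is not the authors' metric or code:</p>
<pre><code class="language-python">
import numpy as np
from scipy.optimize import linear_sum_assignment

def relational_signature(pts):
    # For each detected object, the sorted distances to the other objects seen in
    # the same frame; invariant to the unknown rigid transform between the LiDARs.
    d = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    return np.sort(d, axis=1)[:, 1:]

def estimate_extrinsic(vehicle_pts, infra_pts):
    # 1. affinity/cost matrix between per-object signatures, then Hungarian matching
    sv, si = relational_signature(vehicle_pts), relational_signature(infra_pts)
    cost = np.linalg.norm(sv[:, None, :] - si[None, :, :], axis=-1)
    v_idx, i_idx = linear_sum_assignment(cost)
    src, dst = infra_pts[i_idx], vehicle_pts[v_idx]
    # 2. Kabsch: closed-form rotation and translation from the matched pairs
    src_c, dst_c = src - src.mean(0), dst - dst.mean(0)
    U, _, Vt = np.linalg.svd(src_c.T @ dst_c)
    D = np.diag([1.0, 1.0, np.sign(np.linalg.det(Vt.T @ U.T))])
    R = Vt.T @ D @ U.T
    t = dst.mean(0) - R @ src.mean(0)
    return R, t

# toy check: recover a known 90-degree yaw and offset from 6 shared detections
rng = np.random.default_rng(0)
R_true = np.array([[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]])
t_true = np.array([10.0, -2.0, 0.5])
infra = rng.normal(scale=20.0, size=(6, 3))
vehicle = infra @ R_true.T + t_true
R_est, t_est = estimate_extrinsic(vehicle, infra)
print(np.allclose(R_est, R_true), np.allclose(t_est, t_true))
</code></pre>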
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IROS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09816">arXiv:2407.09816</a> <span> [<a href="https://arxiv.org/pdf/2407.09816">pdf</a>, <a href="https://arxiv.org/format/2407.09816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MaskMoE: Boosting Token-Level Learning via Routing Mask in Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhenpeng Su</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zijia Lin</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xue Bai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xing Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yizhe Xiong</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+H">Haoran Lian</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guangyuan Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+G">Guiguang Ding</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wei Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Songlin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09816v4-abstract-short" style="display: inline;"> Scaling the size of a model enhances its capabilities but significantly increases computation complexity. Mixture-of-Experts models (MoE) address the issue by allowing model size to scale up without substantially increasing training or inference costs. In MoE, there is an important module called the router, which is used to distribute each token to the experts. Currently, the mainstream routing me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09816v4-abstract-full').style.display = 'inline'; document.getElementById('2407.09816v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09816v4-abstract-full" style="display: none;"> Scaling the size of a model enhances its capabilities but significantly increases computation complexity. Mixture-of-Experts models (MoE) address the issue by allowing model size to scale up without substantially increasing training or inference costs. In MoE, there is an important module called the router, which is used to distribute each token to the experts. Currently, the mainstream routing methods include dynamic routing and fixed routing. Despite their promising results, MoE models encounter several challenges. Primarily, for dynamic routing methods, the dispersion of training tokens across multiple experts can lead to underfitting, particularly for infrequent tokens. Additionally, though fixed routing methods can mitigate that issue, they compromise on the diversity of representations. In this paper, we propose \textbf{MaskMoE}, a method designed to enhance token-level learning by employing a routing \textbf{mask}ing technique within the \textbf{M}ixture-\textbf{o}f-\textbf{E}xperts model. 
MaskMoE is capable of maintaining representation diversity while achieving more comprehensive training. Experimental results demonstrate that our method outperforms previous dominant Mixture-of-Experts models in terms of both perplexity (PPL) and downstream task performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09816v4-abstract-full').style.display = 'none'; document.getElementById('2407.09816v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06691">arXiv:2407.06691</a> <span> [<a href="https://arxiv.org/pdf/2407.06691">pdf</a>, <a href="https://arxiv.org/ps/2407.06691">ps</a>, <a href="https://arxiv.org/format/2407.06691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> OFDM Achieves the Lowest Ranging Sidelobe Under Random ISAC Signaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yifeng Xiong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuangyang Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Weijie Yuan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Feifei Gao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shi Jin</a>, <a href="/search/cs?searchtype=author&query=Caire%2C+G">Giuseppe Caire</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06691v2-abstract-short" style="display: inline;"> This paper aims to answer a fundamental question in the area of Integrated Sensing and Communications (ISAC): What is the optimal communication-centric ISAC waveform for ranging? Towards that end, we first established a generic framework to analyze the sensing performance of communication-centric ISAC waveforms built upon orthonormal signaling bases and random data symbols. Then, we evaluated thei… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06691v2-abstract-full').style.display = 'inline'; document.getElementById('2407.06691v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06691v2-abstract-full" style="display: none;"> This paper aims to answer a fundamental question in the area of Integrated Sensing and Communications (ISAC): What is the optimal communication-centric ISAC waveform for ranging? 
Towards that end, we first established a generic framework to analyze the sensing performance of communication-centric ISAC waveforms built upon orthonormal signaling bases and random data symbols. Then, we evaluated their ranging performance by adopting both the periodic and aperiodic auto-correlation functions (P-ACF and A-ACF), and defined the expectation of the integrated sidelobe level (EISL) as a sensing performance metric. On top of that, we proved that among all communication waveforms with cyclic prefix (CP), the orthogonal frequency division multiplexing (OFDM) modulation is the only globally optimal waveform that achieves the lowest ranging sidelobe for quadrature amplitude modulation (QAM) and phase shift keying (PSK) constellations, in terms of both the EISL and the sidelobe level at each individual lag of the P-ACF. As a step forward, we proved that among all communication waveforms without CP, OFDM is a locally optimal waveform for QAM/PSK in the sense that it achieves a local minimum of the EISL of the A-ACF. Finally, we demonstrated by numerical results that under QAM/PSK constellations, there is no other orthogonal communication-centric waveform that achieves a lower ranging sidelobe level than that of the OFDM, in terms of both P-ACF and A-ACF cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06691v2-abstract-full').style.display = 'none'; document.getElementById('2407.06691v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
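<p class="is-size-7">The abstract above evaluates ranging through the expectation of the integrated sidelobe level (EISL) of the auto-correlation function. As a purely numerical illustration of that quantity (not the paper's derivation, and with an arbitrary normalization and parameter choice), one can Monte-Carlo-estimate the periodic-ACF sidelobe energy of a single CP-free OFDM symbol carrying random QPSK data:</p>
<pre><code class="language-python">
import numpy as np

def eisl_pacf(num_subcarriers=64, trials=2000, seed=0):
    # Average sidelobe energy of the periodic (circular) autocorrelation over
    # random QPSK draws; illustrative only, normalization differs from the paper.
    rng = np.random.default_rng(seed)
    sidelobe_energy = 0.0
    for _ in range(trials):
        bits = rng.integers(0, 4, size=num_subcarriers)
        symbols = np.exp(1j * (np.pi / 2) * bits + 1j * np.pi / 4)   # unit-modulus QPSK
        x = np.fft.ifft(symbols) * np.sqrt(num_subcarriers)          # unit-power OFDM symbol
        acf = np.fft.ifft(np.abs(np.fft.fft(x)) ** 2) / num_subcarriers
        sidelobe_energy += np.sum(np.abs(acf[1:]) ** 2)              # exclude the zero lag
    return sidelobe_energy / trials

print(eisl_pacf())
</code></pre>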
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 11 figures, submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06358">arXiv:2407.06358</a> <span> [<a href="https://arxiv.org/pdf/2407.06358">pdf</a>, <a href="https://arxiv.org/format/2407.06358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MiraData: A Large-Scale Video Dataset with Long Durations and Structured Captions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ju%2C+X">Xuan Ju</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yiming Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaoyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Ziyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xintao Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+A">Ailing Zeng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yu Xiong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiang Xu</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Ying Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06358v1-abstract-short" style="display: inline;"> Sora's high-motion intensity and long consistent videos have significantly impacted the field of video generation, attracting unprecedented attention. However, existing publicly available datasets are inadequate for generating Sora-like videos, as they mainly contain short videos with low motion intensity and brief captions. To address these issues, we propose MiraData, a high-quality video datase… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06358v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06358v1-abstract-full" style="display: none;"> Sora's high-motion intensity and long consistent videos have significantly impacted the field of video generation, attracting unprecedented attention. However, existing publicly available datasets are inadequate for generating Sora-like videos, as they mainly contain short videos with low motion intensity and brief captions. To address these issues, we propose MiraData, a high-quality video dataset that surpasses previous ones in video duration, caption detail, motion strength, and visual quality. We curate MiraData from diverse, manually selected sources and meticulously process the data to obtain semantically consistent clips. GPT-4V is employed to annotate structured captions, providing detailed descriptions from four different perspectives along with a summarized dense caption. To better assess temporal consistency and motion intensity in video generation, we introduce MiraBench, which enhances existing benchmarks by adding 3D consistency and tracking-based motion strength metrics. 
MiraBench includes 150 evaluation prompts and 17 metrics covering temporal consistency, motion strength, 3D consistency, visual quality, text-video alignment, and distribution similarity. To demonstrate the utility and effectiveness of MiraData, we conduct experiments using our DiT-based video generation model, MiraDiT. The experimental results on MiraBench demonstrate the superiority of MiraData, especially in motion strength. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06358v1-abstract-full').style.display = 'none'; document.getElementById('2407.06358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18485">arXiv:2406.18485</a> <span> [<a href="https://arxiv.org/pdf/2406.18485">pdf</a>, <a href="https://arxiv.org/format/2406.18485">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> LoongTrain: Efficient Training of Long-Sequence LLMs with Head-Context Parallelism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+D">Diandian Gu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+P">Peng Sun</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghao Hu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Ting Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xun Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yingtong Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoteng Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qiaoling Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shangchun Zhao</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jiarui Fang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yonggang Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xin Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuanzhe Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18485v1-abstract-short" style="display: inline;"> Efficiently training LLMs with long sequences is important yet challenged by the massive computation and memory requirements. Sequence parallelism has been proposed to tackle these problems, but existing methods suffer from scalability or efficiency issues. We propose LoongTrain, a novel system to efficiently train LLMs with long sequences at scale. 
The core of LoongTrain is the 2D-Attention mecha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18485v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18485v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18485v1-abstract-full" style="display: none;"> Efficiently training LLMs with long sequences is important yet challenged by the massive computation and memory requirements. Sequence parallelism has been proposed to tackle these problems, but existing methods suffer from scalability or efficiency issues. We propose LoongTrain, a novel system to efficiently train LLMs with long sequences at scale. The core of LoongTrain is the 2D-Attention mechanism, which combines both head-parallel and context-parallel techniques to break the scalability constraints while maintaining efficiency. We introduce Double-Ring-Attention and analyze the performance of device placement strategies to further speed up training. We implement LoongTrain with the hybrid ZeRO and Selective Checkpoint++ techniques. Experiment results show that LoongTrain outperforms state-of-the-art baselines, i.e., DeepSpeed-Ulysses and Megatron Context Parallelism, in both end-to-end training speed and scalability, and improves Model FLOPs Utilization (MFU) by up to 2.88x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18485v1-abstract-full').style.display = 'none'; document.getElementById('2406.18485v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
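<p class="is-size-7">LoongTrain's 2D-Attention combines head parallelism with context parallelism. As a rough, assumption-laden illustration of how such a two-dimensional layout partitions work across devices (a toy index calculation only, not LoongTrain's 2D-Attention, Double-Ring-Attention, or placement logic):</p>
<pre><code class="language-python">
def placement(world_size, head_parallel, num_heads, seq_len):
    # Arrange ranks as a head-parallel x context-parallel grid, so each rank owns
    # a slice of attention heads and a contiguous chunk of the sequence.
    assert world_size % head_parallel == 0
    context_parallel = world_size // head_parallel
    assert num_heads % head_parallel == 0 and seq_len % context_parallel == 0
    heads_per_rank = num_heads // head_parallel
    tokens_per_rank = seq_len // context_parallel
    table = {}
    for rank in range(world_size):
        hp_idx, cp_idx = rank // context_parallel, rank % context_parallel
        head_span = (hp_idx * heads_per_rank, (hp_idx + 1) * heads_per_rank)
        token_span = (cp_idx * tokens_per_rank, (cp_idx + 1) * tokens_per_rank)
        table[rank] = {"heads": head_span, "tokens": token_span}
    return table

# 8 GPUs as a 2 (head-parallel) x 4 (context-parallel) grid,
# 16 attention heads, 32k-token sequence
for rank, slices in placement(8, 2, 16, 32768).items():
    print(rank, slices)
</code></pre>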
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11891">arXiv:2406.11891</a> <span> [<a href="https://arxiv.org/pdf/2406.11891">pdf</a>, <a href="https://arxiv.org/format/2406.11891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Adaptive Neighborhood for Advancing Temporal Interaction Graph Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yun Xiong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xixi Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yongrui Fu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yinglong Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11891v1-abstract-short" style="display: inline;"> Temporal Graph Networks (TGNs) have demonstrated their remarkable performance in modeling temporal interaction graphs. These works can generate temporal node representations by encoding the surrounding neighborhoods for the target node. However, an inherent limitation of existing TGNs is their reliance on fixed, hand-crafted rules for neighborhood encoding, overlooking the necessity for an adaptiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11891v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11891v1-abstract-full" style="display: none;"> Temporal Graph Networks (TGNs) have demonstrated their remarkable performance in modeling temporal interaction graphs. These works can generate temporal node representations by encoding the surrounding neighborhoods for the target node. However, an inherent limitation of existing TGNs is their reliance on fixed, hand-crafted rules for neighborhood encoding, overlooking the necessity for an adaptive and learnable neighborhood that can accommodate both personalization and temporal evolution across different timestamps. In this paper, we aim to enhance existing TGNs by introducing an adaptive neighborhood encoding mechanism. We present SEAN, a flexible plug-and-play model that can be seamlessly integrated with existing TGNs, effectively boosting their performance. To achieve this, we decompose the adaptive neighborhood encoding process into two phases: (i) representative neighbor selection, and (ii) temporal-aware neighborhood information aggregation. Specifically, we propose the Representative Neighbor Selector component, which automatically pinpoints the most important neighbors for the target node. 
It offers a tailored understanding of each node's unique surrounding context, facilitating personalization. Subsequently, we propose a Temporal-aware Aggregator, which synthesizes neighborhood aggregation by selectively determining the utilization of aggregation routes and decaying the outdated information, allowing our model to adaptively leverage both the contextually significant and current information during aggregation. We conduct extensive experiments by integrating SEAN into three representative TGNs, evaluating their performance on four public datasets and one financial benchmark dataset introduced in this paper. The results demonstrate that SEAN consistently leads to performance improvements across all models, achieving SOTA performance and exceptional robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11891v1-abstract-full').style.display = 'none'; document.getElementById('2406.11891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">KDD'2024 Research Track Paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11836">arXiv:2406.11836</a> <span> [<a href="https://arxiv.org/pdf/2406.11836">pdf</a>, <a href="https://arxiv.org/format/2406.11836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> RetinaGS: Scalable Training for Dense Scene Rendering with Billion-Scale 3D Gaussians </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+B">Bingling Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengyi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Luchao Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+K">Kaimin Liao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Sijie Yan</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuanjun Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11836v2-abstract-short" style="display: inline;"> In this work, we explore the possibility of training high-parameter 3D Gaussian splatting (3DGS) models on large-scale, high-resolution datasets. We design a general model parallel training method for 3DGS, named RetinaGS, which uses a proper rendering equation and can be applied to any scene and arbitrary distribution of Gaussian primitives. 
It enables us to explore the scaling behavior of 3DGS i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11836v2-abstract-full').style.display = 'inline'; document.getElementById('2406.11836v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11836v2-abstract-full" style="display: none;"> In this work, we explore the possibility of training high-parameter 3D Gaussian splatting (3DGS) models on large-scale, high-resolution datasets. We design a general model parallel training method for 3DGS, named RetinaGS, which uses a proper rendering equation and can be applied to any scene and arbitrary distribution of Gaussian primitives. It enables us to explore the scaling behavior of 3DGS in terms of primitive numbers and training resolutions that were difficult to explore before and surpass previous state-of-the-art reconstruction quality. We observe a clear positive trend of increasing visual quality when increasing primitive numbers with our method. We also demonstrate the first attempt at training a 3DGS model with more than one billion primitives on the full MatrixCity dataset that attains a promising visual quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11836v2-abstract-full').style.display = 'none'; document.getElementById('2406.11836v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11833">arXiv:2406.11833</a> <span> [<a href="https://arxiv.org/pdf/2406.11833">pdf</a>, <a href="https://arxiv.org/format/2406.11833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MMDU: A Multi-Turn Multi-Image Dialog Understanding Benchmark and Instruction-Tuning Dataset for LVLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyu Liu</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+T">Tao Chu</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+Y">Yuhang Zang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xilin Wei</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaoyi Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zijian Liang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuanjun Xiong</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11833v2-abstract-short" style="display: inline;"> Generating natural and meaningful responses to communicate with multi-modal human inputs is a fundamental capability of Large Vision-Language Models(LVLMs). While current open-source LVLMs demonstrate promising performance in simplified scenarios such as single-turn single-image input, they fall short in real-world conversation scenarios such as following instructions in a long context history wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11833v2-abstract-full').style.display = 'inline'; document.getElementById('2406.11833v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11833v2-abstract-full" style="display: none;"> Generating natural and meaningful responses to communicate with multi-modal human inputs is a fundamental capability of Large Vision-Language Models(LVLMs). While current open-source LVLMs demonstrate promising performance in simplified scenarios such as single-turn single-image input, they fall short in real-world conversation scenarios such as following instructions in a long context history with multi-turn and multi-images. Existing LVLM benchmarks primarily focus on single-choice questions or short-form responses, which do not adequately assess the capabilities of LVLMs in real-world human-AI interaction applications. Therefore, we introduce MMDU, a comprehensive benchmark, and MMDU-45k, a large-scale instruction tuning dataset, designed to evaluate and improve LVLMs' abilities in multi-turn and multi-image conversations. 
We employ a clustering algorithm to find relevant images and textual descriptions from open-source Wikipedia and construct the question-answer pairs with human annotators, assisted by the GPT-4o model. MMDU has a maximum of 18k image+text tokens, 20 images, and 27 turns, which is at least 5x longer than previous benchmarks and poses challenges to current LVLMs. Our in-depth analysis of 15 representative LVLMs using MMDU reveals that open-source LVLMs lag behind closed-source counterparts due to limited conversational instruction-tuning data. We demonstrate that fine-tuning open-source LVLMs on MMDU-45k significantly addresses this gap, generating longer and more accurate conversations and improving scores on MMDU and existing benchmarks (MMStar: +1.1%, MathVista: +1.5%, ChartQA: +1.2%). Our contributions pave the way for bridging the gap between current LVLM models and real-world application demands. This project is available at https://github.com/Liuziyu77/MMDU.

Submitted 29 October, 2024; v1 submitted 17 June, 2024; originally announced June 2024.

Comments: This project is available at https://github.com/Liuziyu77/MMDU
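The abstract names only a clustering step for grouping Wikipedia images and descriptions, with no further detail in this listing. As a rough, hypothetical sketch of that kind of step (embed each image's description, cluster the embeddings, and hand each cluster of related images to annotators and GPT-4o to write multi-turn QA pairs), assuming a sentence-embedding encoder and k-means rather than MMDU's actual pipeline:

# Hypothetical sketch of the image-grouping step described in the MMDU abstract.
# The encoder, the use of k-means, and the cluster count are assumptions made
# for illustration; the paper's actual clustering pipeline may differ.
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def group_wikipedia_images(entries, n_clusters=1000):
    """entries: list of (image_url, description_text) pairs collected from Wikipedia."""
    texts = [text for _, text in entries]
    encoder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed text encoder
    embeddings = encoder.encode(texts, normalize_embeddings=True)
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(embeddings)

    clusters = defaultdict(list)
    for (url, text), label in zip(entries, labels):
        clusters[label].append((url, text))
    # Each cluster of topically related images and descriptions then seeds one
    # multi-image, multi-turn dialogue written by annotators with GPT-4o assistance.
    return clusters

Usage would amount to clusters = group_wikipedia_images(scraped_entries); any off-the-shelf embedding model or clustering method could stand in for the ones assumed here.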
class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 
47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>