Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,208 results for author: <span class="mathjax">Wu, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wu%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wu, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wu%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wu, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17465">arXiv:2411.17465</a> <span> [<a href="https://arxiv.org/pdf/2411.17465">pdf</a>, <a href="https://arxiv.org/format/2411.17465">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> ShowUI: One Vision-Language-Action Model for GUI Visual Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+K+Q">Kevin Qinghong Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linjie Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+D">Difei Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shiwei Wu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zechen Bai</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+W">Weixian Lei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijuan Wang</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17465v1-abstract-short" style="display: 
inline;"> Building Graphical User Interface (GUI) assistants holds significant promise for enhancing human workflow productivity. While most agents are language-based, relying on closed-source API with text-rich meta-information (e.g., HTML or accessibility tree), they show limitations in perceiving UI visuals as humans do, highlighting the need for GUI visual agents. In this work, we develop a vision-langu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17465v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17465v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17465v1-abstract-full" style="display: none;"> Building Graphical User Interface (GUI) assistants holds significant promise for enhancing human workflow productivity. While most agents are language-based, relying on closed-source API with text-rich meta-information (e.g., HTML or accessibility tree), they show limitations in perceiving UI visuals as humans do, highlighting the need for GUI visual agents. In this work, we develop a vision-language-action model in digital world, namely ShowUI, which features the following innovations: (i) UI-Guided Visual Token Selection to reduce computational costs by formulating screenshots as an UI connected graph, adaptively identifying their redundant relationship and serve as the criteria for token selection during self-attention blocks; (ii) Interleaved Vision-Language-Action Streaming that flexibly unifies diverse needs within GUI tasks, enabling effective management of visual-action history in navigation or pairing multi-turn query-action sequences per screenshot to enhance training efficiency; (iii) Small-scale High-quality GUI Instruction-following Datasets by careful data curation and employing a resampling strategy to address significant data type imbalances. With above components, ShowUI, a lightweight 2B model using 256K data, achieves a strong 75.1% accuracy in zero-shot screenshot grounding. Its UI-guided token selection further reduces 33% of redundant visual tokens during training and speeds up the performance by 1.4x. Navigation experiments across web Mind2Web, mobile AITW, and online MiniWob environments further underscore the effectiveness and potential of our model in advancing GUI visual agents. The models are available at https://github.com/showlab/ShowUI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17465v1-abstract-full').style.display = 'none'; document.getElementById('2411.17465v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report. 
Github: https://github.com/showlab/ShowUI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14922">arXiv:2411.14922</a> <span> [<a href="https://arxiv.org/pdf/2411.14922">pdf</a>, <a href="https://arxiv.org/format/2411.14922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GOT4Rec: Graph of Thoughts for Sequential Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+Z">Zewen Long</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14922v1-abstract-short" style="display: inline;"> With the advancement of large language models (LLMs), researchers have explored various methods to optimally leverage their comprehension and generation capabilities in sequential recommendation scenarios. However, several challenges persist in this endeavor. Firstly, most existing approaches rely on the input-output prompting paradigm, which can result in irrelevant or inaccurate responses. Secon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14922v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14922v1-abstract-full" style="display: none;"> With the advancement of large language models (LLMs), researchers have explored various methods to optimally leverage their comprehension and generation capabilities in sequential recommendation scenarios. However, several challenges persist in this endeavor. Firstly, most existing approaches rely on the input-output prompting paradigm, which can result in irrelevant or inaccurate responses. Secondly, while there have been attempts to enhance LLMs using prompting strategies such as chain-of-thought (CoT), these efforts have not fully harnessed the reasoning abilities of LLMs or effectively captured the multifaceted information contained within user sequences. To address these limitations, we propose GOT4Rec, a sequential recommendation method that utilizes the graph of thoughts (GoT) prompting strategy. Specifically, we identify and utilize three key types of information within user history sequences: short-term interests, long-term interests and collaborative information from other users. Our approach enables LLMs to independently reason and generate recommendations based on these distinct types of information, subsequently aggregating the results within the GoT framework to derive the final recommended items. This method allows LLMs, with enhanced reasoning capabilities, to more effectively consider the diverse information within user sequences, resulting in more accurate recommendations and more comprehensive explanations. 
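To make the token-selection idea above concrete, here is a hypothetical sketch (not the authors' implementation): treat screenshot patches as nodes of a UI graph, link near-identical neighbors, and keep one representative token per connected component. The patch features, the threshold `tau`, and the union-find bookkeeping are all illustrative assumptions.

```python
import numpy as np

def select_ui_tokens(patches: np.ndarray, tau: float = 5.0):
    """Hypothetical sketch of UI-guided visual token selection.

    patches: (H, W, D) grid of patch features (e.g., mean RGB per patch).
    Adjacent patches whose features differ by less than `tau` are treated
    as redundant (the same UI region); one representative per connected
    component is kept.
    """
    H, W, _ = patches.shape
    parent = list(range(H * W))

    def find(i):
        while parent[i] != i:
            parent[i] = parent[parent[i]]  # path halving
            i = parent[i]
        return i

    def union(i, j):
        parent[find(i)] = find(j)

    for r in range(H):
        for c in range(W):
            i = r * W + c
            # Connect to right and bottom neighbors if nearly identical.
            for dr, dc in ((0, 1), (1, 0)):
                rr, cc = r + dr, c + dc
                if rr < H and cc < W:
                    if np.linalg.norm(patches[r, c] - patches[rr, cc]) < tau:
                        union(i, rr * W + cc)

    return sorted({find(i) for i in range(H * W)})  # one token per component

# A 16x16 grid of mostly uniform patches collapses to very few tokens:
grid = np.zeros((16, 16, 3)); grid[4:8, 4:8] = 100.0
print(len(select_ui_tokens(grid)))  # 2 components -> 2 kept tokens
```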
2. arXiv:2411.14922 [pdf, other] · cs.IR, cs.AI
   GOT4Rec: Graph of Thoughts for Sequential Recommendation
   Authors: Zewen Long, Liang Wang, Shu Wu, Qiang Liu, Liang Wang
   Abstract: With the advancement of large language models (LLMs), researchers have explored various methods to optimally leverage their comprehension and generation capabilities in sequential recommendation scenarios. However, several challenges persist. First, most existing approaches rely on the input-output prompting paradigm, which can produce irrelevant or inaccurate responses. Second, while attempts have been made to enhance LLMs with prompting strategies such as chain-of-thought (CoT), these efforts have not fully harnessed the reasoning abilities of LLMs or effectively captured the multifaceted information contained in user sequences. To address these limitations, we propose GOT4Rec, a sequential recommendation method that utilizes the graph of thoughts (GoT) prompting strategy. Specifically, we identify and utilize three key types of information within user history sequences: short-term interests, long-term interests, and collaborative information from other users. Our approach enables LLMs to reason independently and generate recommendations based on these distinct types of information, subsequently aggregating the results within the GoT framework to derive the final recommended items. This method allows LLMs, with enhanced reasoning capabilities, to more effectively consider the diverse information within user sequences, resulting in more accurate recommendations and more comprehensive explanations. Extensive experiments on real-world datasets demonstrate the effectiveness of GOT4Rec, indicating that it outperforms existing state-of-the-art baselines. Our code is available at https://anonymous.4open.science/r/GOT4Rec-ED99.
   Submitted 22 November, 2024; originally announced November 2024.
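A schematic of the graph-of-thoughts flow the abstract describes: reason over the short-term, long-term, and collaborative views independently, then aggregate the branches. The `llm` callable and the prompt wording are placeholders, not the released code.

```python
from typing import Callable, List

def got4rec_sketch(history: List[str], neighbors: List[List[str]],
                   llm: Callable[[str], str]) -> str:
    """Illustrative graph-of-thoughts pipeline: three independent reasoning
    branches over a user sequence, merged by an aggregation node."""
    short_term = llm(f"Recent items: {history[-3:]}. Infer short-term "
                     "interests and suggest items.")
    long_term = llm(f"Full history: {history}. Infer long-term interests "
                    "and suggest items.")
    collab = llm(f"Similar users bought: {neighbors}. Suggest items this "
                 "user may also like.")
    # Aggregation node of the thought graph: merge the three branches.
    return llm("Combine these candidate lists into a final ranked list:\n"
               f"1) {short_term}\n2) {long_term}\n3) {collab}")

# Usage with a trivial stand-in "model" that echoes the first prompt line:
echo = lambda prompt: prompt.splitlines()[0][:40]
print(got4rec_sketch(["book", "lamp", "pen"], [["ink", "paper"]], echo))
```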
3. arXiv:2411.12762 [pdf, other] · cs.CL, cs.AI
   Playing Language Game with LLMs Leads to Jailbreaking
   Authors: Yu Peng, Zewen Long, Fangming Dong, Congyi Li, Shu Wu, Kai Chen
   Abstract: The advent of large language models (LLMs) has spurred the development of numerous jailbreak techniques aimed at circumventing their security defenses against malicious attacks. An effective jailbreak approach is to identify a domain where safety generalization fails, a phenomenon known as mismatched generalization. In this paper, we introduce two novel jailbreak methods based on mismatched generalization: natural language games and custom language games. Both effectively bypass the safety mechanisms of LLMs and, because they come in many kinds and variants, are hard to defend against, leading to high attack success rates. Natural language games involve synthetic linguistic constructs and the actions intertwined with these constructs, such as the Ubbi Dubbi language. Building on this phenomenon, we propose the custom language games method: by engaging with LLMs using a variety of custom rules, we successfully execute jailbreak attacks across multiple LLM platforms. Extensive experiments demonstrate the effectiveness of our methods, achieving success rates of 93% on GPT-4o, 89% on GPT-4o-mini, and 83% on Claude-3.5-Sonnet. Furthermore, to investigate the generalizability of safety alignment, we fine-tuned Llama-3.1-70B with the custom language games to achieve safety alignment on our datasets and found that, when interacting through other language games, the fine-tuned models still failed to identify harmful content. This finding indicates that the safety-alignment knowledge embedded in LLMs fails to generalize across different linguistic formats, opening new avenues for future research in this area.
   Submitted 16 November, 2024; originally announced November 2024.
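For context on what a "language game" is: a deterministic text transform. Ubbi Dubbi, the children's game the abstract cites, inserts "ub" before each vowel run. A minimal encoder of the transform itself, purely to illustrate the linguistic concept:

```python
import re

def ubbi_dubbi(text: str) -> str:
    """Ubbi Dubbi language game: insert 'ub' before each vowel run."""
    return re.sub(r"[aeiouAEIOU]+", lambda m: "ub" + m.group(0), text)

print(ubbi_dubbi("speak"))  # -> spubeak
print(ubbi_dubbi("hello"))  # -> hubellubo
```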
4. arXiv:2411.12183 [pdf, other] · eess.SY, cs.LG
   Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of Beamlines
   Authors: Siyu Wang, Shengran Dai, Jianhui Jiang, Shuang Wu, Yufei Peng, Junbin Zhang
   Abstract: Synchrotron radiation sources play a crucial role in fields such as materials science, biology, and chemistry. The beamline, a key subsystem of the synchrotron, modulates and directs the radiation to the sample for analysis. However, aligning a beamline is a complex, time-consuming process, primarily carried out manually by experienced engineers, and even minor misalignments in optical components can significantly affect the beam's properties, leading to suboptimal experimental outcomes. Automated methods such as Bayesian optimization (BO) and reinforcement learning (RL) improve on manual alignment, but limitations remain: the relationship between the current and target beam properties, which is crucial for determining the adjustment, is not fully considered, and the physical characteristics of optical elements are overlooked, such as the need to adjust specific devices to control the output beam's spot size or position. This paper addresses beamline alignment by modeling it as a Markov Decision Process (MDP) and training an intelligent agent with RL. The agent computes adjustment values from the current and target beam states, executes actions, and iterates until optimal parameters are reached. A policy network with action attention is designed to improve decision-making by considering both state differences and the impact of optical components. Experiments on two simulated beamlines demonstrate that our algorithm outperforms existing methods, with ablation studies highlighting the effectiveness of the action-attention-based policy network.
   Submitted 18 November, 2024; originally announced November 2024.
   Comments: 17 pages, 5 figures
5. arXiv:2411.12146 [pdf, other] · eess.IV, cs.CV, cs.LG
   Self-supervised denoising of visual field data improves detection of glaucoma progression
   Authors: Sean Wu, Jun Yu Chen, Vahid Mohammadzadeh, Sajad Besharati, Jaewon Lee, Kouros Nouri-Mahdavi, Joseph Caprioli, Zhe Fei, Fabien Scalzo
   Abstract: Perimetric measurements provide insight into a patient's peripheral vision and day-to-day functioning and are the main outcome measure for identifying progression of visual damage from glaucoma. However, visual field data can be noisy, exhibiting high variance, especially with increasing damage. In this study, we demonstrate the utility of self-supervised deep learning in denoising visual field data from over 4000 patients to enhance its signal-to-noise ratio and its ability to detect true glaucoma progression. We deployed both a variational autoencoder (VAE) and a masked autoencoder to determine which self-supervised model best smooths the visual field data while reconstructing salient features that are less noisy and more predictive of worsening disease. Our results indicate that including a categorical p-value at every visual field location improves the smoothing of visual field data. Masked autoencoders led to cleaner denoised data than previous methods such as variational autoencoders. A 4.7% increase in detection of progressing eyes with pointwise linear regression (PLR) was observed. The masked and variational autoencoders' smoothed data predicted glaucoma progression 2.3 months earlier when p-values were included than when they were not. The faster prediction of time to progression (TTP) and the higher percentage of progression detected support our hypothesis that masking out visual field elements during training, while including p-values at each location, improves detection of visual field progression. Our study has clinically relevant implications regarding masking when training neural networks to denoise visual field data, resulting in earlier and more accurate detection of glaucoma progression. This denoising model can be integrated into future models for visual field analysis to enhance detection of glaucoma progression.
   Submitted 18 November, 2024; originally announced November 2024.
   Comments: 10 pages
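A minimal sketch of the masking idea for visual field vectors. The 54-location grid, the (sensitivity, p-value) input channels, and the tiny MLP are assumptions for illustration, not the paper's architecture.

```python
import torch
import torch.nn as nn

N_LOC = 54  # visual field test locations (assumed; e.g., a 24-2 grid)

class MaskedAE(nn.Module):
    """Toy masked autoencoder over (sensitivity, p-value) per location."""
    def __init__(self, hidden=128):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(N_LOC * 2, hidden), nn.ReLU())
        self.dec = nn.Linear(hidden, N_LOC)  # reconstruct sensitivities

    def forward(self, sens, pval, mask):
        x = torch.cat([sens * mask, pval], dim=-1)  # hide masked locations
        return self.dec(self.enc(x))

model = MaskedAE()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sens = torch.rand(32, N_LOC) * 35                     # fake dB sensitivities
pval = torch.randint(0, 4, (32, N_LOC)).float() / 3   # categorical p-values
mask = (torch.rand(32, N_LOC) > 0.5).float()          # mask ~50% of points

recon = model(sens, pval, mask)
loss = ((recon - sens) ** 2 * (1 - mask)).mean()  # loss on masked points only
loss.backward(); opt.step()
print(float(loss))
```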
6. arXiv:2411.10438 [pdf, other] · cs.LG, math.OC, stat.ML
   MARS: Unleashing the Power of Variance Reduction for Training Large Models
   Authors: Huizhuo Yuan, Yifeng Liu, Shuang Wu, Xun Zhou, Quanquan Gu
   Abstract: Training deep neural networks, and more recently large models, demands efficient and scalable optimizers. Adaptive gradient algorithms like Adam, AdamW, and their variants have been central to this task. Despite the development of numerous variance reduction algorithms in the past decade aimed at accelerating stochastic optimization in both convex and nonconvex settings, variance reduction has not found widespread success in training deep neural networks or large language models, and it has consequently remained a less favored approach in modern AI. In this paper, to unleash the power of variance reduction for efficient training of large models, we propose a unified optimization framework, MARS (Make vAriance Reduction Shine), which reconciles preconditioned gradient methods with variance reduction via a scaled stochastic recursive momentum technique. Within our framework, we introduce three instances of MARS that leverage preconditioned gradient updates based on AdamW, Lion, and Shampoo, respectively. We also draw a connection between our algorithms and existing optimizers. Experimental results on training GPT-2 models indicate that MARS consistently outperforms AdamW by a large margin.
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: 23 pages, 7 figures, 6 tables
7. arXiv:2411.06334 [pdf, other] · cs.NI
   A Multicast Scheme for Live Streaming Courses in Large-Scale, Geographically Dense Campus Networks
   Authors: Senxin Wu, Jinlong Hu, Ling Zhang
   Abstract: Video courses have become a significant component of modern education. However, the increasing demand for live streaming video courses places considerable strain on the service capabilities of campus networks. The challenges associated with live streaming course videos in campus network environments exhibit distinct spatial distribution characteristics: the audience for a specific video course may be highly concentrated in certain areas, leading to a large number of users attempting to access the same live stream simultaneously. Using a Content Delivery Network (CDN) to distribute videos in these campus scenarios creates substantial unicast pressure on edge CDN servers. This paper proposes a two-layer, dynamically partitioned Recursive Bit String (RBS) virtual-domain network-layer multicast architecture designed for large-scale, geographically dense multicast scenarios within campus networks. This approach reduces redundant multicast messages by approximately 10-30% compared to two-layer fixed partitioning. Additionally, it establishes multicast source authentication based on Source Address Validation Improvement (SAVI) and facilitates secure multicast group key exchange using a concise exchange protocol within the WebRTC framework. In the next-generation data plane of programmable software-defined networks, RBS stateless multicast can be combined with the characteristics of large-scale, geographically dense campus networks to dynamically and efficiently extend multicast coverage to every dormitory.
   Submitted 9 November, 2024; originally announced November 2024.
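For intuition on stateless bit-string multicast in general (a BIER-style toy, not the paper's recursive RBS encoding): the packet header carries a bit string naming the receivers, and each hop replicates the packet only toward ports whose receiver masks intersect it, so routers keep no per-group state.

```python
def forward(bitstring: int, port_masks: dict[int, int]):
    """Toy stateless multicast forwarding: replicate a packet to every
    egress port whose receiver mask intersects the header bitstring,
    keeping only the relevant bits in each copy. RBS layers recursive,
    per-region bit strings on top of this basic idea."""
    copies = []
    for port, mask in port_masks.items():
        overlap = bitstring & mask
        if overlap:
            copies.append((port, overlap))  # forward only relevant bits
    return copies

# Receivers 0-5; three ports cover receivers {0,1}, {2,3}, {4,5}.
masks = {1: 0b000011, 2: 0b001100, 3: 0b110000}
print(forward(0b101001, masks))  # -> [(1, 1), (2, 8), (3, 32)]
```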
8. arXiv:2411.05898 [pdf, other] · cs.CV, cs.AI, cs.RO
   Integrating Object Detection Modality into Visual Language Model for Enhanced Autonomous Driving Agent
   Authors: Linfeng He, Yiming Sun, Sihao Wu, Jiaxu Liu, Xiaowei Huang
   Abstract: In this paper, we propose a novel framework for enhancing visual comprehension in autonomous driving systems by integrating visual language models (VLMs) with an additional visual perception module specialised in object detection. We extend the Llama-Adapter architecture by incorporating a YOLOS-based detection network alongside the CLIP perception network, addressing limitations in object detection and localisation. Our approach introduces camera ID-separators to improve multi-view processing, which is crucial for comprehensive environmental awareness. Experiments on the DriveLM visual question answering challenge demonstrate significant improvements over baseline models, with enhanced performance in ChatGPT scores, BLEU scores, and CIDEr metrics, indicating the closeness of model answers to ground truth. Our method represents a promising step towards more capable and interpretable autonomous driving systems. Possible safety enhancements enabled by the detection modality are also discussed.
   Submitted 8 November, 2024; originally announced November 2024.
   Comments: accepted by SafeGenAI workshop of NeurIPS 2024
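One plausible way to splice detections into a VLM prompt, using the public Hugging Face YOLOS checkpoint; the text serialization and the `<CAM_...>` separator token are illustrative assumptions, not the paper's format.

```python
import torch
from PIL import Image
from transformers import YolosImageProcessor, YolosForObjectDetection

processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")

def detections_as_text(image: Image.Image, cam_id: str, thresh=0.7) -> str:
    """Run YOLOS and serialize detections as prompt text, prefixed with a
    camera-ID separator so a multi-view VLM can tell the views apart."""
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    target_sizes = torch.tensor([image.size[::-1]])  # (H, W)
    dets = processor.post_process_object_detection(
        outputs, threshold=thresh, target_sizes=target_sizes)[0]
    parts = [f"<{cam_id}>"]  # assumed camera ID-separator token
    for score, label, box in zip(dets["scores"], dets["labels"], dets["boxes"]):
        name = model.config.id2label[int(label)]
        parts.append(f"{name}@{[round(float(v)) for v in box]}")
    return " ".join(parts)

# prompt = question + " " + detections_as_text(front_img, "CAM_FRONT")
```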
9. arXiv:2411.05591 [pdf, other] · stat.ML, cs.LG
   Network EM Algorithm for Gaussian Mixture Model in Decentralized Federated Learning
   Authors: Shuyuan Wu, Bin Du, Xuetong Li, Hansheng Wang
   Abstract: We systematically study various network Expectation-Maximization (EM) algorithms for the Gaussian mixture model within the framework of decentralized federated learning. Our theoretical investigation reveals that directly extending the classical decentralized supervised learning method to the EM algorithm exhibits poor estimation accuracy with heterogeneous data across clients and struggles to converge numerically when Gaussian components are poorly separated. To address these issues, we propose two novel solutions. First, to handle heterogeneous data, we introduce a momentum network EM (MNEM) algorithm, which uses a momentum parameter to combine information from both the current and historical estimators. Second, to tackle the challenge of poorly separated Gaussian components, we develop a semi-supervised MNEM (semi-MNEM) algorithm, which leverages partially labeled data. Rigorous theoretical analysis demonstrates that MNEM can achieve statistical efficiency comparable to that of the whole-sample estimator when the mixture components satisfy certain separation conditions, even in heterogeneous scenarios. Moreover, the semi-MNEM estimator enhances the convergence speed of the MNEM algorithm, effectively addressing the numerical convergence challenges in poorly separated scenarios. Extensive simulation and real-data analyses are conducted to justify our theoretical findings.
   Submitted 8 November, 2024; originally announced November 2024.
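The momentum idea can be sketched on a toy one-dimensional GMM: blend each fresh EM update with the historical estimator. The blending form below is an assumption for illustration, and the network-averaging part of MNEM (combining estimators across clients) is omitted.

```python
import numpy as np

rng = np.random.default_rng(1)
x = np.concatenate([rng.normal(-2, 1, 500), rng.normal(3, 1, 500)])

def em_step(x, mu, pi):
    """One EM step for a two-component 1-D GMM with unit variances."""
    dens = np.exp(-0.5 * (x[:, None] - mu) ** 2) * pi      # unnormalized
    resp = dens / dens.sum(axis=1, keepdims=True)          # E-step
    pi_new = resp.mean(axis=0)                             # M-step
    mu_new = (resp * x[:, None]).sum(axis=0) / resp.sum(axis=0)
    return mu_new, pi_new

mu, pi, alpha = np.array([-1.0, 1.0]), np.array([0.5, 0.5]), 0.7
for _ in range(50):
    mu_em, pi_em = em_step(x, mu, pi)
    # Momentum blend of the current EM update with the historical estimator.
    mu = alpha * mu_em + (1 - alpha) * mu
    pi = alpha * pi_em + (1 - alpha) * pi
print(mu.round(2), pi.round(2))  # approaches [-2, 3] and [0.5, 0.5]
```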
10. arXiv:2411.05292 [pdf, other] · cs.CV, cs.AI
    SimpleBEV: Improved LiDAR-Camera Fusion Architecture for 3D Object Detection
    Authors: Yun Zhao, Zhan Gong, Peiru Zheng, Hong Zhu, Shaohua Wu
    Abstract: A growing body of work fuses LiDAR and camera information to improve 3D object detection in autonomous driving systems. Recently, a simple yet effective fusion framework achieved excellent detection performance by fusing LiDAR and camera features in a unified bird's-eye-view (BEV) space. In this paper, we propose a LiDAR-camera fusion framework, named SimpleBEV, for accurate 3D object detection, which follows the BEV-based fusion framework and improves the camera and LiDAR encoders, respectively. Specifically, we perform camera-based depth estimation using a cascade network and rectify the depth results with depth information derived from the LiDAR points. Meanwhile, an auxiliary branch that performs 3D object detection using only the camera-BEV features is introduced to exploit the camera information during the training phase. Besides, we improve the LiDAR feature extractor by fusing multi-scale sparse-convolution features. Experimental results demonstrate the effectiveness of our proposed method: it achieves 77.6% NDS on the nuScenes dataset, showcasing superior performance on the 3D object detection track.
    Submitted 7 November, 2024; originally announced November 2024.
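The depth-rectification step can be pictured with a simplified stand-in: sparse projected LiDAR depths overwrite the camera network's dense estimates wherever a point projects. The paper's cascade depth network and exact rectification rule are not reproduced here.

```python
import numpy as np

def rectify_depth(cam_depth: np.ndarray, lidar_depth: np.ndarray) -> np.ndarray:
    """Rectify dense camera depth with sparse LiDAR depth.

    cam_depth:   (H, W) dense depth predicted from the image.
    lidar_depth: (H, W) sparse depth from projected LiDAR points,
                 0 where no point projects.
    """
    valid = lidar_depth > 0
    out = cam_depth.copy()
    out[valid] = lidar_depth[valid]   # trust LiDAR where it exists
    return out

cam = np.full((4, 4), 10.0)
lidar = np.zeros((4, 4)); lidar[1, 2] = 7.5
print(rectify_depth(cam, lidar)[1, 2])  # 7.5
```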
We plan to release our code as open-source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04844v1-abstract-full').style.display = 'none'; document.getElementById('2411.04844v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02592">arXiv:2411.02592</a> <span> [<a href="https://arxiv.org/pdf/2411.02592">pdf</a>, <a href="https://arxiv.org/format/2411.02592">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Decoupled Data Augmentation for Improving Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ruoxin Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Ke-Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shuang Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiamu Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shouli Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+S">Shouhong Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02592v1-abstract-short" style="display: inline;"> Recent advancements in image mixing and generative data augmentation have shown promise in enhancing image classification. However, these techniques face the challenge of balancing semantic fidelity with diversity. Specifically, image mixing involves interpolating two images to create a new one, but this pixel-level interpolation can compromise fidelity. Generative augmentation uses text-to-image… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02592v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02592v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02592v1-abstract-full" style="display: none;"> Recent advancements in image mixing and generative data augmentation have shown promise in enhancing image classification. However, these techniques face the challenge of balancing semantic fidelity with diversity. Specifically, image mixing involves interpolating two images to create a new one, but this pixel-level interpolation can compromise fidelity. Generative augmentation uses text-to-image generative models to synthesize or modify images, often limiting diversity to avoid generating out-of-distribution data that potentially affects accuracy. We propose that this fidelity-diversity dilemma partially stems from the whole-image paradigm of existing methods. 
Since an image comprises the class-dependent part (CDP) and the class-independent part (CIP), and each part has a fundamentally different impact on the image's fidelity, treating different parts uniformly can be misleading. To address this fidelity-diversity dilemma, we introduce Decoupled Data Augmentation (De-DA), which resolves it by separating images into CDPs and CIPs and handling them adaptively. To maintain fidelity, we use generative models to modify real CDPs under controlled conditions, preserving semantic consistency. To enhance diversity, we replace the image's CIP with inter-class variants, creating diverse CDP-CIP combinations. Additionally, we implement an online randomized combination strategy during training to generate numerous distinct CDP-CIP combinations cost-effectively. Comprehensive empirical evaluations validate the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02592v1-abstract-full').style.display = 'none'; document.getElementById('2411.02592v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02576">arXiv:2411.02576</a> <span> [<a href="https://arxiv.org/pdf/2411.02576">pdf</a>, <a href="https://arxiv.org/format/2411.02576">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Designing and Evaluating Sampling Strategies for Multiple-Forecast Visualization (MFV) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+R">Ruishi Zou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Siyi Wu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bingsheng Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dakuo Wang</a>, <a href="/search/cs?searchtype=author&query=Padilla%2C+L">Lace Padilla</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02576v1-abstract-short" style="display: inline;"> With the growing availability of quantitative forecasts from various sources, effectively communicating these multiple forecasts has become increasingly crucial. Recent advances have explored using Multiple-Forecast Visualizations (MFVs) to display multiple time-series forecasts. However, how to systematically sample from a pool of disparate forecasts to create MFVs that effectively facilitate dec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02576v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02576v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02576v1-abstract-full" style="display: none;"> With the growing availability of quantitative forecasts from various sources, effectively communicating these multiple forecasts has become increasingly crucial.
Recent advances have explored using Multiple-Forecast Visualizations (MFVs) to display multiple time-series forecasts. However, how to systematically sample from a pool of disparate forecasts to create MFVs that effectively facilitate decision-making requires further investigation. To address this challenge, we examine two cluster-based sampling strategies for creating MFVs and three designs for visualizing them to assist people in decision-making with forecasts. Through two online studies (Experiment 1 n = 711 and Experiment 2 n = 400) and over 15 decision-making-related metrics, we evaluated participants' perceptions of eight visualization designs using historical COVID-19 forecasts as a test bed. Our findings revealed that one sampling method significantly enhanced participants' ability to predict future outcomes, thereby reducing their surprise when confronted with the actual outcomes. Importantly, since no approach excels in all metrics, we advise choosing different visualization designs based on communication goals. Furthermore, qualitative response data demonstrate a correlation between response consistency and people's inclination to extrapolate from the forecast segment of the visualization. This research offers insights into how to improve visualizations of multiple forecasts using an automated and empirically validated technique for selecting forecasts that outperform common techniques on several key metrics and reduce overplotting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02576v1-abstract-full').style.display = 'none'; document.getElementById('2411.02576v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01830">arXiv:2411.01830</a> <span> [<a href="https://arxiv.org/pdf/2411.01830">pdf</a>, <a href="https://arxiv.org/format/2411.01830">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FaaSTube: Optimizing GPU-oriented Data Transfer for Serverless Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Junxiao Deng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M">Minchen Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yue Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yaochen Liu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Hao Fan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Song Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01830v1-abstract-short" style="display: inline;"> Serverless computing has gained significant traction for machine learning inference applications, which are often deployed as serverless workflows consisting of multiple CPU and GPU functions with data dependency. 
However, existing data-passing solutions for serverless computing primarily rely on host memory for fast data transfer, mandating substantial data movement and resulting in salient I/O… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01830v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01830v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01830v1-abstract-full" style="display: none;"> Serverless computing has gained significant traction for machine learning inference applications, which are often deployed as serverless workflows consisting of multiple CPU and GPU functions with data dependency. However, existing data-passing solutions for serverless computing primarily rely on host memory for fast data transfer, mandating substantial data movement and resulting in salient I/O overhead. In this paper, we present FaaSTube, a GPU-efficient data passing system for serverless inference. FaaSTube manages intermediate data within a GPU memory pool to facilitate direct data exchange between GPU functions. It enables fine-grained bandwidth sharing over PCIe and NVLink, minimizing data-passing latency for both host-to-GPU and GPU-to-GPU while providing performance isolation between functions. Additionally, FaaSTube implements an elastic GPU memory pool that dynamically scales to accommodate varying data-passing demands. Evaluations on real-world applications show that FaaSTube reduces end-to-end latency by up to 90% and achieves up to 12x higher throughput compared to the state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01830v1-abstract-full').style.display = 'none'; document.getElementById('2411.01830v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
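</p> <p class="is-size-7">To make the data-passing idea above concrete, here is a minimal, illustrative Python sketch of a growable GPU-side buffer pool through which one "GPU function" hands intermediate results to another by reference rather than staging them in host memory. All names (GPUMemoryPool, put/get) are hypothetical, and the sketch deliberately omits real CUDA allocation and PCIe/NVLink bandwidth management; it is not FaaSTube's implementation or API.</p> <pre><code>
# Toy model of a GPU memory pool for serverless data passing.
# Hypothetical sketch only; not FaaSTube's actual implementation.
class GPUMemoryPool:
    def __init__(self, capacity_mb):
        self.capacity_mb = capacity_mb
        self.used_mb = 0
        self.buffers = {}                     # key -> (size_mb, payload)

    def put(self, key, size_mb, payload):
        # Elastic growth stands in for dynamic pool scaling.
        if self.used_mb + size_mb > self.capacity_mb:
            self.capacity_mb = self.used_mb + size_mb
        self.used_mb += size_mb
        self.buffers[key] = (size_mb, payload)

    def get(self, key):
        # Consumers receive a reference; no host round-trip is modeled.
        return self.buffers[key][1]

    def free(self, key):
        size_mb, _ = self.buffers.pop(key)
        self.used_mb -= size_mb

pool = GPUMemoryPool(capacity_mb=256)

def preprocess(pool):                          # "GPU function" 1
    pool.put("features", 64, payload=[0.1, 0.2, 0.3])

def infer(pool):                               # "GPU function" 2
    return sum(pool.get("features"))           # reads the buffer in place

preprocess(pool)
print(infer(pool))                             # no host-memory staging
pool.free("features")
</code></pre> <p class="is-size-7">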
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01169">arXiv:2411.01169</a> <span> [<a href="https://arxiv.org/pdf/2411.01169">pdf</a>, <a href="https://arxiv.org/format/2411.01169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TKDE.2024.3397683">10.1109/TKDE.2024.3397683 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bi-Level Graph Structure Learning for Next POI Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanqiao Zhu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xiang Tao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengdi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01169v1-abstract-short" style="display: inline;"> Next point-of-interest (POI) recommendation aims to predict a user's next destination based on sequential check-in history and a set of POI candidates. Graph neural networks (GNNs) have demonstrated a remarkable capability in this endeavor by exploiting the extensive global collaborative signals present among POIs. However, most of the existing graph-based approaches construct graph structures bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01169v1-abstract-full" style="display: none;"> Next point-of-interest (POI) recommendation aims to predict a user's next destination based on sequential check-in history and a set of POI candidates. Graph neural networks (GNNs) have demonstrated a remarkable capability in this endeavor by exploiting the extensive global collaborative signals present among POIs. However, most of the existing graph-based approaches construct graph structures based on pre-defined heuristics, failing to consider inherent hierarchical structures of POI features such as geographical locations and visiting peaks, or suffering from noisy and incomplete structures in graphs. To address the aforementioned issues, this paper presents a novel Bi-level Graph Structure Learning (BiGSL) for next POI recommendation. 
BiGSL first learns a hierarchical graph structure to capture the fine-to-coarse connectivity between POIs and prototypes, and then uses a pairwise learning module to dynamically infer relationships between POI pairs and prototype pairs. Based on the learned bi-level graphs, our model then employs a multi-relational graph network that considers both POI- and prototype-level neighbors, resulting in improved POI representations. Our bi-level structure learning scheme is more robust to data noise and incompleteness, and improves the exploration ability for recommendation by alleviating sparsity issues. Experimental results on three real-world datasets demonstrate the superiority of our model over existing state-of-the-art methods, with a significant improvement in recommendation accuracy and exploration performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01169v1-abstract-full').style.display = 'none'; document.getElementById('2411.01169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Knowledge and Data Engineering</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Knowledge and Data Engineering, vol. 36, no. 11, pp. 5695-5708, Nov. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01158">arXiv:2411.01158</a> <span> [<a href="https://arxiv.org/pdf/2411.01158">pdf</a>, <a href="https://arxiv.org/format/2411.01158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Molecular Networks">q-bio.MN</span> </div> </div> <p class="title is-5 mathjax"> Pin-Tuning: Parameter-Efficient In-Context Tuning for Few-Shot Molecular Property Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shaozhen Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xin Sun</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01158v1-abstract-short" style="display: inline;"> Molecular property prediction (MPP) is integral to drug discovery and material science, but often faces the challenge of data scarcity in real-world scenarios. Addressing this, few-shot molecular property prediction (FSMPP) has been developed. 
Unlike other few-shot tasks, FSMPP typically employs a pre-trained molecular encoder and a context-aware classifier, benefiting from molecular pre-training… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01158v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01158v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01158v1-abstract-full" style="display: none;"> Molecular property prediction (MPP) is integral to drug discovery and material science, but often faces the challenge of data scarcity in real-world scenarios. Addressing this, few-shot molecular property prediction (FSMPP) has been developed. Unlike other few-shot tasks, FSMPP typically employs a pre-trained molecular encoder and a context-aware classifier, benefiting from molecular pre-training and molecular context information. Despite these advancements, existing methods struggle with the ineffective fine-tuning of pre-trained encoders. We attribute this issue to the imbalance between the abundance of tunable parameters and the scarcity of labeled molecules, and the lack of contextual perceptiveness in the encoders. To overcome this hurdle, we propose a parameter-efficient in-context tuning method, named Pin-Tuning. Specifically, we propose a lightweight adapter for pre-trained message passing layers (MP-Adapter) and Bayesian weight consolidation for pre-trained atom/bond embedding layers (Emb-BWC), to achieve parameter-efficient tuning while preventing over-fitting and catastrophic forgetting. Additionally, we enhance the MP-Adapters with contextual perceptiveness. This innovation allows for in-context tuning of the pre-trained encoder, thereby improving its adaptability for specific FSMPP tasks. When evaluated on public datasets, our method demonstrates superior tuning with fewer trainable parameters, improving few-shot predictive performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01158v1-abstract-full').style.display = 'none'; document.getElementById('2411.01158v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
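</p> <p class="is-size-7">As a rough illustration of the parameter-efficient tuning pattern the abstract describes, the following PyTorch sketch wraps a frozen pre-trained layer in a small residual bottleneck adapter so that only the adapter's weights are trained. The class name, sizes, and structure are assumptions for illustration; the paper's MP-Adapter and Emb-BWC components differ in detail.</p> <pre><code>
# Generic bottleneck-adapter sketch (hypothetical; not the paper's code).
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    def __init__(self, frozen_layer, dim, bottleneck=16):
        super().__init__()
        self.frozen_layer = frozen_layer
        for p in self.frozen_layer.parameters():
            p.requires_grad = False        # pre-trained weights stay frozen
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        nn.init.zeros_(self.up.weight)     # adapter starts as the identity
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        h = self.frozen_layer(x)
        return h + self.up(torch.relu(self.down(h)))   # residual update

layer = nn.Linear(128, 128)        # stand-in for one message passing layer
adapted = BottleneckAdapter(layer, dim=128)
trainable = sum(p.numel() for p in adapted.parameters() if p.requires_grad)
total = sum(p.numel() for p in adapted.parameters())
print(trainable, "of", total, "parameters are tuned")
</code></pre> <p class="is-size-7">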
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00837">arXiv:2411.00837</a> <span> [<a href="https://arxiv.org/pdf/2411.00837">pdf</a>, <a href="https://arxiv.org/format/2411.00837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Longitudinal Mammogram Exam-based Breast Cancer Diagnosis Models: Vulnerability to Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhengbo Zhou</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+D">Degan Hao</a>, <a href="/search/cs?searchtype=author&query=Arefan%2C+D">Dooman Arefan</a>, <a href="/search/cs?searchtype=author&query=Zuley%2C+M">Margarita Zuley</a>, <a href="/search/cs?searchtype=author&query=Sumkin%2C+J">Jules Sumkin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shandong Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00837v1-abstract-short" style="display: inline;"> In breast cancer detection and diagnosis, the longitudinal analysis of mammogram images is crucial. Contemporary models excel in detecting temporal imaging feature changes, thus enhancing the learning process over sequential imaging exams. Yet, the resilience of these longitudinal models against adversarial attacks remains underexplored. In this study, we proposed a novel attack method that capita… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00837v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00837v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00837v1-abstract-full" style="display: none;"> In breast cancer detection and diagnosis, the longitudinal analysis of mammogram images is crucial. Contemporary models excel in detecting temporal imaging feature changes, thus enhancing the learning process over sequential imaging exams. Yet, the resilience of these longitudinal models against adversarial attacks remains underexplored. In this study, we proposed a novel attack method that capitalizes on the feature-level relationship between two sequential mammogram exams of a longitudinal model, guided by both cross-entropy loss and distance metric learning, to achieve significant attack efficacy, as implemented using attack transferring in a black-box attacking manner. We performed experiments on a cohort of 590 breast cancer patients (each has two sequential mammogram exams) in a case-control setting. Results showed that our proposed method surpassed several state-of-the-art adversarial attacks in fooling the diagnosis models to give opposite outputs. Our method remained effective even if the model was trained with the common defending method of adversarial training. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00837v1-abstract-full').style.display = 'none'; document.getElementById('2411.00837v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23601">arXiv:2410.23601</a> <span> [<a href="https://arxiv.org/pdf/2410.23601">pdf</a>, <a href="https://arxiv.org/format/2410.23601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Stabilizing Linear Passive-Aggressive Online Learning with Weighted Reservoir Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Skyler Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+F">Fred Lu</a>, <a href="/search/cs?searchtype=author&query=Raff%2C+E">Edward Raff</a>, <a href="/search/cs?searchtype=author&query=Holt%2C+J">James Holt</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23601v1-abstract-short" style="display: inline;"> Online learning methods, like the seminal Passive-Aggressive (PA) classifier, are still highly effective for high-dimensional streaming data, out-of-core processing, and other throughput-sensitive applications. Many such algorithms rely on fast adaptation to individual errors as a key to their convergence. While such algorithms enjoy low theoretical regret, in real-world deployment they can be sen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23601v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23601v1-abstract-full" style="display: none;"> Online learning methods, like the seminal Passive-Aggressive (PA) classifier, are still highly effective for high-dimensional streaming data, out-of-core processing, and other throughput-sensitive applications. Many such algorithms rely on fast adaptation to individual errors as a key to their convergence. While such algorithms enjoy low theoretical regret, in real-world deployment they can be sensitive to individual outliers that cause the algorithm to over-correct. When such outliers occur at the end of the data stream, this can cause the final solution to have unexpectedly low accuracy. We design a weighted reservoir sampling (WRS) approach to obtain a stable ensemble model from the sequence of solutions without requiring additional passes over the data, hold-out sets, or a growing amount of memory. Our key insight is that good solutions tend to be error-free for more iterations than bad solutions, and thus, the number of passive rounds provides an estimate of a solution's relative quality. Our reservoir thus contains $K$ previous intermediate weight vectors with high survival times. 
We demonstrate our WRS approach on the Passive-Aggressive Classifier (PAC) and First-Order Sparse Online Learning (FSOL), where our method consistently and significantly outperforms the unmodified approach. We show that the risk of the ensemble classifier is bounded with respect to the regret of the underlying online learning method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23601v1-abstract-full').style.display = 'none'; document.getElementById('2410.23601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23074">arXiv:2410.23074</a> <span> [<a href="https://arxiv.org/pdf/2410.23074">pdf</a>, <a href="https://arxiv.org/format/2410.23074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-Programming Language Sandbox for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dou%2C+S">Shihan Dou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiazheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+J">Jianxiang Zang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yunbo Tao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Weikang Zhou</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+H">Haoxiang Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shichun Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuming Yang</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+Z">Zhiheng Xi</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shenxi Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaoqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Muling Wu</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+C">Changze Lv</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+L">Limao Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+W">Wenyu Zhan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lin Zhang</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+R">Rongxiang Weng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xunliang Cai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yueming Wu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+M">Ming Wen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+R">Rui Zheng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+T">Tao Ji</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yixin Cao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+T">Tao Gui</a> , et al. 
(3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23074v2-abstract-short" style="display: inline;"> We introduce MPLSandbox, an out-of-the-box multi-programming language sandbox designed to provide unified and comprehensive feedback from compiler and analysis tools for Large Language Models (LLMs). It can automatically identify the programming language of the code, compiling and executing it within an isolated sub-sandbox to ensure safety and stability. In addition, MPLSandbox also integrates bo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23074v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23074v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23074v2-abstract-full" style="display: none;"> We introduce MPLSandbox, an out-of-the-box multi-programming language sandbox designed to provide unified and comprehensive feedback from compiler and analysis tools for Large Language Models (LLMs). It can automatically identify the programming language of the code, compiling and executing it within an isolated sub-sandbox to ensure safety and stability. In addition, MPLSandbox also integrates both traditional and LLM-based code analysis tools, providing a comprehensive analysis of generated code. MPLSandbox can be effortlessly integrated into the training and deployment of LLMs to improve the quality and correctness of their generated code. It also helps researchers streamline their workflows for various LLM-based code-related tasks, reducing the development cost. To validate the effectiveness of MPLSandbox, we integrate it into training and deployment approaches, and also employ it to optimize workflows for a wide range of real-world code-related tasks. Our goal is to enhance researcher productivity on LLM-based code-related tasks by simplifying and automating workflows through delegation to MPLSandbox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23074v2-abstract-full').style.display = 'none'; document.getElementById('2410.23074v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
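</p> <p class="is-size-7">The following short Python sketch illustrates the general sandbox loop the abstract describes: guess the snippet's language, execute it in an isolated subprocess with a timeout, and return unified feedback that a training or deployment pipeline could consume. Function names and the toy language heuristic are invented for illustration and are not MPLSandbox's API.</p> <pre><code>
# Illustrative sandbox loop (hypothetical names; not MPLSandbox's API).
import subprocess, sys, tempfile

def guess_language(code):
    # Toy heuristic; a real sandbox would use a proper classifier.
    if "#include" in code:
        return "c"
    if "def " in code or "print(" in code:
        return "python"
    return "unknown"

def run_in_sandbox(code, timeout_s=5):
    lang = guess_language(code)
    if lang != "python":
        return {"language": lang, "status": "unsupported in this sketch"}
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(code)
        path = f.name
    proc = subprocess.run([sys.executable, path],
                          capture_output=True, text=True, timeout=timeout_s)
    return {"language": lang, "returncode": proc.returncode,
            "stdout": proc.stdout, "stderr": proc.stderr}

print(run_in_sandbox("print(1 + 1)"))   # unified feedback for an LLM loop
</code></pre> <p class="is-size-7">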
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22715">arXiv:2410.22715</a> <span> [<a href="https://arxiv.org/pdf/2410.22715">pdf</a>, <a href="https://arxiv.org/format/2410.22715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SCRREAM : SCan, Register, REnder And Map:A Framework for Annotating Accurate and Dense 3D Indoor Scenes with a Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jung%2C+H">HyunJun Jung</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weihang Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shun-Cheng Wu</a>, <a href="/search/cs?searchtype=author&query=Bittner%2C+W">William Bittner</a>, <a href="/search/cs?searchtype=author&query=Brasch%2C+N">Nikolas Brasch</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jifei Song</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rez-Pellitero%2C+E">Eduardo P茅rez-Pellitero</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhensong Zhang</a>, <a href="/search/cs?searchtype=author&query=Moreau%2C+A">Arthur Moreau</a>, <a href="/search/cs?searchtype=author&query=Navab%2C+N">Nassir Navab</a>, <a href="/search/cs?searchtype=author&query=Busam%2C+B">Benjamin Busam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22715v1-abstract-short" style="display: inline;"> Traditionally, 3d indoor datasets have generally prioritized scale over ground-truth accuracy in order to obtain improved generalization. However, using these datasets to evaluate dense geometry tasks, such as depth rendering, can be problematic as the meshes of the dataset are often incomplete and may produce wrong ground truth to evaluate the details. In this paper, we propose SCRREAM, a dataset… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22715v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22715v1-abstract-full" style="display: none;"> Traditionally, 3d indoor datasets have generally prioritized scale over ground-truth accuracy in order to obtain improved generalization. However, using these datasets to evaluate dense geometry tasks, such as depth rendering, can be problematic as the meshes of the dataset are often incomplete and may produce wrong ground truth to evaluate the details. In this paper, we propose SCRREAM, a dataset annotation framework that allows annotation of fully dense meshes of objects in the scene and registers camera poses on the real image sequence, which can produce accurate ground truth for both sparse 3D as well as dense 3D tasks. 
We show the details of the dataset annotation pipeline and showcase four possible variants of datasets that can be obtained from our framework with example scenes, such as indoor reconstruction and SLAM, scene editing & object removal, human reconstruction and 6D pose estimation. Recent pipelines for indoor reconstruction and SLAM serve as new benchmarks. In contrast to previous indoor datasets, our design allows evaluating dense geometry tasks on eleven sample scenes against accurately rendered ground truth depth maps. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22715v1-abstract-full').style.display = 'none'; document.getElementById('2410.22715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22473">arXiv:2410.22473</a> <span> [<a href="https://arxiv.org/pdf/2410.22473">pdf</a>, <a href="https://arxiv.org/format/2410.22473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> The State of Data Curation at NeurIPS: An Assessment of Dataset Development Practices in the Datasets and Benchmarks Track </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bhardwaj%2C+E">Eshta Bhardwaj</a>, <a href="/search/cs?searchtype=author&query=Gujral%2C+H">Harshit Gujral</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Siyi Wu</a>, <a href="/search/cs?searchtype=author&query=Zogheib%2C+C">Ciara Zogheib</a>, <a href="/search/cs?searchtype=author&query=Maharaj%2C+T">Tegan Maharaj</a>, <a href="/search/cs?searchtype=author&query=Becker%2C+C">Christoph Becker</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22473v1-abstract-short" style="display: inline;"> Data curation is a field with origins in librarianship and archives, whose scholarship and thinking on data issues go back centuries, if not millennia. The field of machine learning is increasingly observing the importance of data curation to the advancement of both applications and fundamental understanding of machine learning models - evidenced not least by the creation of the Datasets and Bench… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22473v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22473v1-abstract-full" style="display: none;"> Data curation is a field with origins in librarianship and archives, whose scholarship and thinking on data issues go back centuries, if not millennia. The field of machine learning is increasingly observing the importance of data curation to the advancement of both applications and fundamental understanding of machine learning models - evidenced not least by the creation of the Datasets and Benchmarks track itself.
This work provides an analysis of dataset development practices at NeurIPS through the lens of data curation. We present an evaluation framework for dataset documentation, consisting of a rubric and toolkit developed through a literature review of data curation principles. We use the framework to assess the strengths and weaknesses in current dataset development practices of 60 datasets published in the NeurIPS Datasets and Benchmarks track from 2021-2023. We summarize key findings and trends. Results indicate greater need for documentation about environmental footprint, ethical considerations, and data management. We suggest targeted strategies and resources to improve documentation in these areas and provide recommendations for the NeurIPS peer-review process that prioritize rigorous data curation in ML. Finally, we provide results in the format of a dataset that showcases aspects of recommended data curation practices. Our rubric and results are of interest for improving data curation practices broadly in the field of ML as well as to data curation and science and technology studies scholars studying practices in ML. Our aim is to support continued improvement in interdisciplinary research on dataset practices, ultimately improving the reusability and reproducibility of new datasets and benchmarks, enabling standardized and informed human oversight, and strengthening the foundation of rigorous and responsible ML research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22473v1-abstract-full').style.display = 'none'; document.getElementById('2410.22473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
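</p> <p class="is-size-7">A rubric-based assessment of the kind described above can be pictured as a simple checklist applied to each dataset's documentation. The criteria below merely paraphrase themes mentioned in the abstract (environmental footprint, ethics, data management) and are not the authors' actual rubric items or scoring scale.</p> <pre><code>
# Hypothetical sketch of scoring dataset documentation against a rubric.
RUBRIC = [
    "data management plan",
    "ethical considerations",
    "environmental footprint",
    "licensing and access",
]

def assess(documentation):
    # Mark each criterion as documented (1) or missing (0).
    scores = {c: int(c in documentation) for c in RUBRIC}
    scores["total"] = sum(scores[c] for c in RUBRIC)
    return scores

doc = {"licensing and access", "ethical considerations"}
print(assess(doc))   # flags the missing documentation areas
</code></pre> <p class="is-size-7">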
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in NeurIPS Datasets & Benchmarks track 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21970">arXiv:2410.21970</a> <span> [<a href="https://arxiv.org/pdf/2410.21970">pdf</a>, <a href="https://arxiv.org/format/2410.21970">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Not All Languages are Equal: Insights into Multilingual Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Suhang Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jialong Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Baosong Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ante Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+K">Kaidi Jia</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiawei Yu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Junfeng Yao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jinsong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21970v1-abstract-short" style="display: inline;"> RALMs (Retrieval-Augmented Language Models) broaden their knowledge scope by incorporating external textual resources. However, the multilingual nature of global knowledge necessitates RALMs to handle diverse languages, a topic that has received limited research focus. In this work, we propose \textit{Futurepedia}, a carefully crafted benchmark containing parallel texts across eight representative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21970v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21970v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21970v1-abstract-full" style="display: none;"> RALMs (Retrieval-Augmented Language Models) broaden their knowledge scope by incorporating external textual resources. However, the multilingual nature of global knowledge necessitates RALMs to handle diverse languages, a topic that has received limited research focus. In this work, we propose \textit{Futurepedia}, a carefully crafted benchmark containing parallel texts across eight representative languages. We evaluate six multilingual RALMs using our benchmark to explore the challenges of multilingual RALMs. Experimental results reveal linguistic inequalities: 1) high-resource languages stand out in Monolingual Knowledge Extraction; 2) Indo-European languages lead RALMs to provide answers directly from documents, alleviating the challenge of expressing answers across languages; 3) English benefits from RALMs' selection bias and speaks louder in multilingual knowledge selection. Based on these findings, we offer advice for improving multilingual Retrieval Augmented Generation. For monolingual knowledge extraction, careful attention must be paid to cascading errors from translating low-resource languages into high-resource ones. 
In cross-lingual knowledge transfer, encouraging RALMs to provide answers within documents in different languages can improve transfer performance. For multilingual knowledge selection, incorporating more non-English documents and repositioning English documents can help mitigate RALMs' selection bias. Through comprehensive experiments, we underscore the complexities inherent in multilingual RALMs and offer valuable insights for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21970v1-abstract-full').style.display = 'none'; document.getElementById('2410.21970v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21332">arXiv:2410.21332</a> <span> [<a href="https://arxiv.org/pdf/2410.21332">pdf</a>, <a href="https://arxiv.org/format/2410.21332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Building, Reusing, and Generalizing Abstract Representations from Concrete Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shuchen Wu</a>, <a href="/search/cs?searchtype=author&query=Thalmann%2C+M">Mirko Thalmann</a>, <a href="/search/cs?searchtype=author&query=Dayan%2C+P">Peter Dayan</a>, <a href="/search/cs?searchtype=author&query=Akata%2C+Z">Zeynep Akata</a>, <a href="/search/cs?searchtype=author&query=Schulz%2C+E">Eric Schulz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21332v1-abstract-short" style="display: inline;"> Humans excel at learning abstract patterns across different sequences, filtering out irrelevant details, and transferring these generalized concepts to new sequences. In contrast, many sequence learning models lack the ability to abstract, which leads to memory inefficiency and poor transfer. We introduce a non-parametric hierarchical variable learning model (HVM) that learns chunks from sequences… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21332v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21332v1-abstract-full" style="display: none;"> Humans excel at learning abstract patterns across different sequences, filtering out irrelevant details, and transferring these generalized concepts to new sequences. In contrast, many sequence learning models lack the ability to abstract, which leads to memory inefficiency and poor transfer. 
We introduce a non-parametric hierarchical variable learning model (HVM) that learns chunks from sequences and abstracts contextually similar chunks as variables. HVM efficiently organizes memory while uncovering abstractions, leading to compact sequence representations. When learning on language datasets such as babyLM, HVM learns a more efficient dictionary than standard compression algorithms such as Lempel-Ziv. In a sequence recall task requiring the acquisition and transfer of variables embedded in sequences, we demonstrate HVM's sequence likelihood correlates with human recall times. In contrast, large language models (LLMs) struggle to transfer abstract variables as effectively as humans. From HVM's adjustable layer of abstraction, we demonstrate that the model realizes a precise trade-off between compression and generalization. Our work offers a cognitive model that captures the learning and transfer of abstract representations in human cognition and differentiates itself from the behavior of large language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21332v1-abstract-full').style.display = 'none'; document.getElementById('2410.21332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span> [<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Goucher%2C+A+P">Adam P. 
Goucher</a>, <a href="/search/cs?searchtype=author&query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/cs?searchtype=author&query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/cs?searchtype=author&query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=M%C4%85dry%2C+A">Aleksander Mądry</a>, <a href="/search/cs?searchtype=author&query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Chow%2C+A">Alex Chow</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/cs?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&query=Paino%2C+A">Alex Paino</a>, <a href="/search/cs?searchtype=author&query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&query=Christakis%2C+A">Alexi Christakis</a> , et al. (395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations.
In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and the measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities.

Submitted 25 October, 2024; originally announced October 2024.

arXiv:2410.21218 [pdf, other] cs.SE

Lifting the Veil on the Large Language Model Supply Chain: Composition, Risks, and Mitigations

Authors: Kaifeng Huang, Bihuan Chen, You Lu, Susheng Wu, Dingji Wang, Yiheng Huang, Haowen Jiang, Zhuotong Zhou, Junming Cao, Xin Peng

Abstract: Large language models (LLMs) have had a significant impact on both intelligence and productivity, and recent years have seen a surge in the introduction of both commercial and open-source LLMs.
Many businesses have adopted LLMs in their applications to solve their own domain-specific tasks. However, integrating LLMs into specific business scenarios requires more than just utilizing the models themselves. Instead, it is a systematic process that involves substantial components, which are collectively referred to as the LLM supply chain. The LLM supply chain inherently carries risks. Therefore, it is essential to understand the types of components that may be introduced into the supply chain and the associated risks, enabling different stakeholders to implement effective mitigation measures. While some literature discusses risks associated with LLMs, there is currently no paper that clearly outlines the LLM supply chain from the perspective of both providing and consuming its components. As LLMs have become essential infrastructure in the new era, we believe that a thorough review of the LLM supply chain, along with its inherent risks and mitigation strategies, would be valuable for industry practitioners to avoid potential damages and losses, and enlightening for academic researchers to rethink existing approaches and explore new avenues of research. Our paper provides a comprehensive overview of the LLM supply chain, detailing the stakeholders, the constituent artifacts, and the types of supply. We developed taxonomies of risk types, risky actions, and mitigations related to various supply chain stakeholders and components. In summary, our work explores the technical and operational aspects of the LLM supply chain, offering valuable insights for researchers and engineers in the evolving LLM landscape.

Submitted 30 October, 2024; v1 submitted 28 October, 2024; originally announced October 2024.
Comments: 17 pages

arXiv:2410.21112 [pdf] cs.RO physics.app-ph

Magnetic Milli-spinner for Robotic Endovascular Surgery

Authors: Shuai Wu, Sophie Leanza, Lu Lu, Yilong Chang, Qi Li, Diego Stone, Ruike Renee Zhao

Abstract: Vascular diseases such as thrombosis, atherosclerosis, and aneurysm, which can lead to blockage of blood flow or blood vessel rupture, are common and life-threatening. Conventional minimally invasive treatments utilize catheters, or long tubes, to guide small devices or therapeutic agents to targeted regions for intervention. Unfortunately, catheters suffer from difficult and unreliable navigation in narrow, winding vessels such as those found in the brain. Magnetically actuated untethered robots, which have been extensively explored as an alternative, are promising for navigation in complex vasculatures and vascular disease treatments. Most current robots, however, cannot swim against high flows or are inadequate in treating certain conditions. Here, we introduce a multifunctional and magnetically actuated milli-spinner robot for rapid navigation and performance of various treatments in complicated vasculatures. The milli-spinner, with a unique hollow structure including helical fins and slits for propulsion, generates a distinct flow field upon spinning.
The milli-spinner is the fastest untethered magnetic robot yet reported for movement in tubular environments, easily achieving speeds of 23 cm/s, demonstrating promise as an untethered medical device for effective navigation in blood vessels and robotic treatment of numerous vascular diseases.

Submitted 28 October, 2024; originally announced October 2024.

arXiv:2410.20268 [pdf, other] cs.LG

Centaur: a foundation model of human cognition

Authors: Marcel Binz, Elif Akata, Matthias Bethge, Franziska Brändle, Fred Callaway, Julian Coda-Forno, Peter Dayan, Can Demircan, Maria K. Eckstein, Noémi Éltető, Thomas L. Griffiths, Susanne Haridi, Akshay K. Jagadish, Li Ji-An, Alexander Kipnis, Sreejan Kumar, Tobias Ludwig, Marvin Mathony, Marcelo Mattar, Alireza Modirshanechi, Surabhi S. Nath, Joshua C. Peterson, Milena Rmus, Evan M. Russek, Tankred Saanum, et al. (16 additional authors not shown)
Abstract: Establishing a unified theory of cognition has been a major goal of psychology. While there have been previous attempts to instantiate such theories by building computational models, we currently do not have one model that captures the human mind in its entirety. Here we introduce Centaur, a computational model that can predict and simulate human behavior in any experiment expressible in natural language. We derived Centaur by finetuning a state-of-the-art language model on a novel, large-scale data set called Psych-101. Psych-101 reaches an unprecedented scale, covering trial-by-trial data from over 60,000 participants performing over 10,000,000 choices in 160 experiments. Centaur not only captures the behavior of held-out participants better than existing cognitive models, but also generalizes to new cover stories, structural task modifications, and entirely new domains. Furthermore, we find that the model's internal representations become more aligned with human neural activity after finetuning. Taken together, Centaur is the first real candidate for a unified model of human cognition. We anticipate that it will have a disruptive impact on the cognitive sciences, challenging the existing paradigm for developing computational models.

Submitted 18 November, 2024; v1 submitted 26 October, 2024; originally announced October 2024.
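To make trial-by-trial evaluation of such a model concrete, here is a minimal sketch of scoring a participant's choice under a causal language model. The model id (gpt2 as a stand-in), the prompt format, and the scoring loop are all assumptions for illustration; they are not the released Centaur or Psych-101 artifacts.

```python
# Hedged sketch: log-likelihood of a recorded choice under a (stand-in) LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in model, not Centaur
lm = AutoModelForCausalLM.from_pretrained("gpt2")
lm.eval()

prompt = "Machine A paid 7 points, machine B paid 3. You press machine"
choice = " A"                                  # the participant's choice

ids = tok(prompt + choice, return_tensors="pt").input_ids
with torch.no_grad():
    logp = torch.log_softmax(lm(ids).logits[0], dim=-1)

n = len(tok(choice).input_ids)                 # number of choice tokens
L = ids.shape[1]
# position j-1 predicts token j, so sum over the choice tokens at the end
score = sum(logp[j - 1, ids[0, j]] for j in range(L - n, L))
print(float(score))                            # higher = more human-like here
```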
arXiv:2410.17584 [pdf, other] cs.SD cs.AI eess.AS

Exploring Tokenization Methods for Multitrack Sheet Music Generation

Authors: Yashan Wang, Shangda Wu, Xingjian Du, Maosong Sun

Abstract: This study explores the tokenization of multitrack sheet music in ABC notation, introducing two methods: bar-stream and line-stream patching. We compare these methods against existing techniques, including bar patching, byte patching, and Byte Pair Encoding (BPE). In terms of both computational efficiency and the musicality of the generated compositions, experimental results show that bar-stream patching performs best overall, making it a promising tokenization strategy for sheet music generation.

Submitted 23 October, 2024; originally announced October 2024.

Comments: 3 pages, 1 figure, 1 table
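Bar patching can be illustrated on a toy ABC tune: split the tune body on bar lines and treat each bar's bytes as one patch. The exact bar-stream and line-stream variants are not specified in this abstract, so the following is only a generic bar-patching sketch.

```python
# Hedged sketch of bar-level patching for ABC notation: one patch per bar.
# The real bar-stream method may differ; this shows the general idea only.
import re

abc_tune = "X:1\nT:Example\nM:4/4\nK:C\nCDEF GABc | cBAG FEDC | C4 z4 |]"

header, body = abc_tune.split("\nK:C\n")          # split off the tune body
bars = [b.strip() for b in re.split(r"\|\]?", body) if b.strip()]
patches = [list(bar.encode("utf-8")) for bar in bars]  # byte patch per bar

print(bars)          # ['CDEF GABc', 'cBAG FEDC', 'C4 z4']
print(patches[0])    # the first bar as a byte-level patch
```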
arXiv:2410.17095 [pdf, other] cs.CR

Inferentially-Private Private Information

Authors: Shuaiqi Wang, Shuran Zheng, Zinan Lin, Giulia Fanti, Zhiwei Steven Wu

Abstract: Information disclosure can compromise privacy when revealed information is correlated with private information. We consider the notion of inferential privacy, which measures privacy leakage by bounding the inferential power a Bayesian adversary can gain by observing a released signal. Our goal is to devise an inferentially-private private information structure that maximizes the informativeness of the released signal, following the Blackwell ordering principle, while adhering to inferential privacy constraints. To achieve this, we devise an efficient release mechanism that achieves the inferentially-private Blackwell-optimal private information structure for the setting where the private information is binary. Additionally, we propose a programming approach to compute the optimal structure for general cases given the utility function. The design of our mechanisms builds on our geometric characterization of the Blackwell-optimal disclosure mechanisms under privacy constraints, which may be of independent interest.

Submitted 22 October, 2024; originally announced October 2024.
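For the binary-secret setting, the inferential-privacy quantity can be illustrated numerically: a Bayesian adversary updates a prior over the secret after observing the released signal, and the leakage shows up as a posterior-to-prior lift. The lift measure below is an illustrative choice, not necessarily the paper's exact definition.

```python
# Hedged numeric illustration of inferential privacy for a binary secret:
# leakage measured as the worst-case posterior/prior ratio after the signal.
prior = {0: 0.7, 1: 0.3}                 # Pr[secret]
channel = {0: {"lo": 0.8, "hi": 0.2},    # Pr[signal | secret = 0]
           1: {"lo": 0.3, "hi": 0.7}}    # Pr[signal | secret = 1]

for sig in ["lo", "hi"]:
    p_sig = sum(prior[s] * channel[s][sig] for s in prior)
    post = {s: prior[s] * channel[s][sig] / p_sig for s in prior}   # Bayes
    lift = max(post[s] / prior[s] for s in prior)
    print(sig, {s: round(post[s], 3) for s in post}, "lift =", round(lift, 3))
# A release mechanism is inferentially private when this lift stays bounded.
```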
arXiv:2410.16770 [pdf, other] cs.CV cs.AI

The Scene Language: Representing Scenes with Programs, Words, and Embeddings

Authors: Yunzhi Zhang, Zizhang Li, Matt Zhou, Shangzhe Wu, Jiajun Wu

Abstract: We introduce the Scene Language, a visual scene representation that concisely and precisely describes the structure, semantics, and identity of visual scenes. It represents a scene with three key components: a program that specifies the hierarchical and relational structure of entities in the scene, words in natural language that summarize the semantic class of each entity, and embeddings that capture the visual identity of each entity. This representation can be inferred from pre-trained language models via a training-free inference technique, given text or image inputs. The resulting scene can be rendered into images using traditional, neural, or hybrid graphics renderers. Together, this forms a robust, automated system for high-quality 3D and 4D scene generation. Compared with existing representations like scene graphs, our proposed Scene Language generates complex scenes with higher fidelity, while explicitly modeling the scene structures to enable precise control and editing.

Submitted 22 October, 2024; originally announced October 2024.

Comments: Project page: https://ai.stanford.edu/~yzzhang/projects/scene-language/
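A minimal data-structure sketch of the three components named here (program, words, embeddings) might look like the following; the class and field names are invented for illustration and are not the paper's API.

```python
# Hedged sketch: one entity = a word (semantic class), an embedding
# (visual identity), and program structure (children / repetition).
from dataclasses import dataclass, field

@dataclass
class Entity:
    word: str                                      # semantic class, e.g. "chair"
    embedding: list                                # visual-identity vector
    children: list = field(default_factory=list)   # hierarchical structure

def repeat(entity, n):
    """A tiny 'program' primitive: n structural copies of an entity."""
    return [Entity(entity.word, entity.embedding) for _ in range(n)]

table = Entity("table", [0.1, 0.9])
table.children = repeat(Entity("chair", [0.4, 0.2]), 4)  # four chairs around it
print(len(table.children), table.children[0].word)       # 4 chair
```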
arXiv:2410.16166 [pdf, other] cs.CV cs.CL

Beyond Filtering: Adaptive Image-Text Quality Enhancement for MLLM Pretraining

Authors: Han Huang, Yuqi Huo, Zijia Zhao, Haoyu Lu, Shu Wu, Bingning Wang, Qiang Liu, Weipeng Chen, Liang Wang

Abstract: Multimodal large language models (MLLMs) have made significant strides by integrating visual and textual modalities. A critical factor in training MLLMs is the quality of image-text pairs within multimodal pretraining datasets. However, de facto filter-based data quality enhancement paradigms often discard a substantial portion of high-quality image data due to inadequate semantic alignment between images and texts, leading to inefficiencies in data utilization and scalability.
In this paper, we propose the Adaptive Image-Text Quality Enhancer (AITQE), a model that dynamically assesses and enhances the quality of image-text pairs. AITQE employs a text rewriting mechanism for low-quality pairs and incorporates a negative sample learning strategy to improve evaluative capabilities by integrating deliberately selected low-quality samples during training. Unlike prior approaches that significantly alter text distributions, our method minimally adjusts text to preserve data volume while enhancing quality. Experimental results demonstrate that AITQE surpasses existing methods on various benchmarks, effectively leveraging raw data and scaling efficiently with increasing data volumes. We hope our work will inspire future research. The code and model are available at: https://github.com/hanhuang22/AITQE.

Submitted 21 October, 2024; originally announced October 2024.
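An assess-then-rewrite loop of the kind this abstract describes could be organized as below. score_pair and rewrite_caption are hypothetical stand-ins for AITQE's quality judgment and text-rewriting mechanism, not the released code.

```python
# Hedged sketch of an assess-then-rewrite data pipeline, AITQE-style:
# low-quality captions are rewritten instead of dropped, preserving volume.
def score_pair(image, caption):
    return 0.4   # stub: a real system would use an MLLM alignment judge

def rewrite_caption(image, caption):
    return caption + " (rewritten)"   # stub for the text-rewriting mechanism

def enhance(dataset, threshold=0.5):
    out = []
    for image, caption in dataset:
        if score_pair(image, caption) < threshold:
            caption = rewrite_caption(image, caption)  # enhance, don't filter
        out.append((image, caption))                   # data volume preserved
    return out

print(enhance([("img0", "a cat"), ("img1", "a dog")]))
```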
arXiv:2410.15780 [pdf] cs.CV cs.AI

An Efficient System for Automatic Map Storytelling -- A Case Study on Historical Maps

Authors: Ziyi Liu, Claudio Affolter, Sidi Wu, Yizi Chen, Lorenz Hurni

Abstract: Historical maps provide valuable information and knowledge about the past. However, as they often feature non-standard projections, hand-drawn styles, and artistic elements, it is challenging for non-experts to identify and interpret them. While existing image captioning methods have achieved remarkable success on natural images, their performance on maps is suboptimal, as maps are underrepresented in their pre-training data. Despite the recent advances of GPT-4 in text recognition and map captioning, it still has a limited understanding of maps, and its performance wanes when texts (e.g., titles and legends) in maps are missing or inaccurate. Besides, it is inefficient or even impractical to fine-tune the model with users' own datasets. To address these problems, we propose a novel and lightweight map-captioning counterpart. Specifically, we fine-tune the state-of-the-art vision-language model CLIP to generate captions relevant to historical maps and enrich the captions with GPT-3.5 to tell a brief story about the where, what, when, and why of a given map. We propose a novel decision tree architecture to generate only captions relevant to the specified map type. Our system is invariant to text alterations in maps. The system can be easily adapted and extended to other map types and scaled to a larger map captioning system. The code is open-sourced at https://github.com/claudaff/automatic-map-storytelling.

Submitted 21 October, 2024; originally announced October 2024.
arXiv:2410.15770 [pdf] cs.AI

A roadmap for generative mapping: unlocking the power of generative AI for map-making

Authors: Sidi Wu, Katharina Henggeler, Yizi Chen, Lorenz Hurni

Abstract: Maps are broadly relevant across various fields, serving as valuable tools for presenting spatial phenomena and communicating spatial knowledge. However, map-making is still largely confined to those with expertise in GIS and cartography due to the specialized software and complex workflow involved, from data processing to visualization. While generative AI has recently demonstrated its remarkable capability in creating various types of content and is widely accessible to the general public, its potential in generating maps is yet to be fully realized. This paper highlights the key applications of generative AI in map-making, summarizes recent advancements in generative AI, identifies the specific technologies required and the challenges of using current methods, and provides a roadmap for developing a generative mapping system (GMS) to make map-making more accessible.

Submitted 21 October, 2024; originally announced October 2024.

arXiv:2410.15748 [pdf, other] cs.AI

Alchemy: Amplifying Theorem-Proving Capability through Symbolic Mutation

Authors: Shaonan Wu, Shuai Lu, Yeyun Gong, Nan Duan, Ping Wei

Abstract: Formal proofs are challenging to write even for experienced experts. Recent progress in Neural Theorem Proving (NTP) shows promise in expediting this process. However, the formal corpora available on the Internet are limited compared to general text, posing a significant data scarcity challenge for NTP.
To address this issue, this work proposes Alchemy, a general framework for data synthesis that constructs formal theorems through symbolic mutation. Specifically, for each candidate theorem in Mathlib, we identify all invocable theorems that can be used to rewrite or apply to it. Subsequently, we mutate the candidate theorem by replacing the corresponding term in the statement with its equivalent form or antecedent. As a result, our method increases the number of theorems in Mathlib by an order of magnitude, from 110k to 6M. Furthermore, we perform continual pretraining and supervised finetuning on this augmented corpus for large language models. Experimental results demonstrate the effectiveness of our approach, achieving a 5% absolute performance improvement on the LeanDojo benchmark. Additionally, our synthetic data achieve a 2.5% absolute performance gain on the out-of-distribution miniF2F benchmark. To provide further insights, we conduct a comprehensive analysis of synthetic data composition and the training paradigm, offering valuable guidance for developing a strong theorem prover.

Submitted 21 October, 2024; originally announced October 2024.
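The mutation step can be caricatured with string rewriting: each known equivalence produces a variant statement. Real Alchemy operates on Lean terms in Mathlib rather than strings, so this is only a shape-of-the-idea sketch.

```python
# Hedged toy sketch of symbolic mutation: replace a term in a theorem
# statement with an equivalent form drawn from known rewrite rules.
rewrites = {
    "a + b": "b + a",      # add_comm
    "min a b": "min b a",  # min_comm
}

def mutate(statement: str) -> list[str]:
    variants = []
    for lhs, rhs in rewrites.items():
        if lhs in statement:
            variants.append(statement.replace(lhs, rhs))  # one new theorem
    return variants

print(mutate("theorem t : min a b <= a + b"))
# ['theorem t : min a b <= b + a', 'theorem t : min b a <= a + b']
```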
arXiv:2410.15273 [pdf] cs.HC

ArchiTone: A LEGO-Inspired Gamified System for Visualized Music Education

Authors: Jiaxing Yu, Tieyao Zhang, Songruoyao Wu, Xinda Wu, Tingxiao Wu, Yanjun Chen, Kejun Zhang

Abstract: Participation in music activities has many benefits, but often requires music theory knowledge and aural skills, which can be challenging for beginners. To help them engage more easily, it is crucial to adopt teaching strategies that lower these barriers. Informed by formative investigation and inspired by LEGO, we introduce ArchiTone, a gamified system that employs constructivism by visualizing music theory concepts as musical blocks and buildings for music education. This system includes two modes: Learning Mode, which involves recognizing and learning common musical blocks through familiar musical works, and Creation Mode, which allows learners to freely create and combine musical blocks to produce new musical works. User studies demonstrate that our gamified system is not only more engaging than traditional music education methods but also more effective in helping learners understand abstract music theory and apply it to music praxis. Additionally, learners demonstrate superior performance on music theory tasks after using ArchiTone.

Submitted 20 October, 2024; originally announced October 2024.
arXiv:2410.13639 [pdf, other] cs.CL

A Comparative Study on Reasoning Patterns of OpenAI's o1 Model

Authors: Siwei Wu, Zhongyuan Peng, Xinrun Du, Tuney Zheng, Minghao Liu, Jialong Wu, Jiachen Ma, Yizhi Li, Jian Yang, Wangchunshu Zhou, Qunshu Lin, Junbo Zhao, Zhaoxiang Zhang, Wenhao Huang, Ge Zhang, Chenghua Lin, J. H. Liu

Abstract: Enabling Large Language Models (LLMs) to handle a wider range of complex tasks (e.g., coding, math) has drawn great attention from many researchers. As LLMs continue to evolve, merely increasing the number of model parameters yields diminishing performance improvements and heavy computational costs. Recently, OpenAI's o1 model has shown that inference strategies (i.e., Test-time Compute methods) can also significantly enhance the reasoning capabilities of LLMs. However, the mechanisms behind these methods are still unexplored.
In our work, to investigate the reasoning patterns of o1, we compare o1 with existing Test-time Compute methods (BoN, Step-wise BoN, Agent Workflow, and Self-Refine) by using OpenAI's GPT-4o as a backbone on general reasoning benchmarks in three domains (i.e., math, coding, commonsense reasoning). Specifically, first, our experiments show that the o1 model has achieved the best performance on most datasets. Second, as for the methods of searching diverse responses (e.g., BoN), we find the reward models' capability and the search space both limit the upper boundary of these methods. Third, as for the methods that break the problem into many sub-problems, the Agent Workflow has achieved better performance than Step-wise BoN due to the domain-specific system prompt for planning better reasoning processes. Fourth, it is worth mentioning that we have summarized six reasoning patterns of o1, and provided a detailed analysis on several reasoning benchmarks.

Submitted 22 October, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
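Of the compared methods, Best-of-N (BoN) is the simplest to sketch: sample N candidates and keep the one a reward model prefers. generate and reward below are stubs standing in for an LLM API and a reward model, not anything from the paper.

```python
# Hedged sketch of Best-of-N (BoN) test-time compute.
import random

def generate(prompt: str) -> str:
    return random.choice(["answer A", "answer B", "answer C"])  # stub LLM

def reward(prompt: str, answer: str) -> float:
    return len(answer)  # stub reward model; a real one scores correctness

def best_of_n(prompt: str, n: int = 8) -> str:
    candidates = [generate(prompt) for _ in range(n)]
    # the reward model's capability bounds how well this selection can do
    return max(candidates, key=lambda a: reward(prompt, a))

print(best_of_n("What is 17 * 24?"))
```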
arXiv:2410.13267 [pdf, other] cs.SD cs.CL eess.AS

CLaMP 2: Multimodal Music Information Retrieval Across 101 Languages Using Large Language Models

Authors: Shangda Wu, Yashan Wang, Ruibin Yuan, Zhancheng Guo, Xu Tan, Ge Zhang, Monan Zhou, Jing Chen, Xuefeng Mu, Yuejie Gao, Yuanliang Dong, Jiafeng Liu, Xiaobing Li, Feng Yu, Maosong Sun

Abstract: Current music information retrieval systems face challenges in managing linguistic diversity and integrating various musical modalities. These limitations reduce their effectiveness in a global, multimodal music environment. To address these issues, we introduce CLaMP 2, a system compatible with 101 languages that supports both ABC notation (a text-based musical notation format) and MIDI (Musical Instrument Digital Interface) for music information retrieval. CLaMP 2, pre-trained on 1.5 million ABC-MIDI-text triplets, includes a multilingual text encoder and a multimodal music encoder aligned via contrastive learning. By leveraging large language models, we obtain refined and consistent multilingual descriptions at scale, significantly reducing textual noise and balancing language distribution. Our experiments show that CLaMP 2 achieves state-of-the-art results in both multilingual semantic search and music classification across modalities, thus establishing a new standard for inclusive and global music information retrieval.

Submitted 17 October, 2024; originally announced October 2024.

Comments: 17 pages, 10 figures, 4 tables
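The contrastive alignment mentioned here is commonly implemented as a CLIP-style InfoNCE loss over matched text-music pairs; the sketch below assumes that standard formulation rather than CLaMP 2's exact training code.

```python
# Hedged sketch of symmetric InfoNCE alignment between two encoders.
import torch
import torch.nn.functional as F

def clip_loss(text_emb, music_emb, temperature=0.07):
    text_emb = F.normalize(text_emb, dim=-1)
    music_emb = F.normalize(music_emb, dim=-1)
    logits = text_emb @ music_emb.T / temperature  # pairwise similarities
    labels = torch.arange(len(logits))             # matched pairs on diagonal
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

# stand-in batch of 8 text and 8 music embeddings (512-d, assumed size)
loss = clip_loss(torch.randn(8, 512), torch.randn(8, 512))
print(float(loss))
```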
arXiv:2410.12568 [pdf, other] cs.RO cs.AI

Robust RL with LLM-Driven Data Synthesis and Policy Adaptation for Autonomous Driving

Authors: Sihao Wu, Jiaxu Liu, Xiangyu Yin, Guangliang Cheng, Xingyu Zhao, Meng Fang, Xinping Yi, Xiaowei Huang

Abstract: The integration of Large Language Models (LLMs) into autonomous driving systems demonstrates strong common sense and reasoning abilities, effectively addressing the pitfalls of purely data-driven methods. Current LLM-based agents require lengthy inference times and face challenges in interacting with real-time autonomous driving environments. A key open question is whether we can effectively leverage the knowledge from LLMs to train an efficient and robust Reinforcement Learning (RL) agent. This paper introduces RAPID, a novel Robust Adaptive Policy Infusion and Distillation framework, which trains specialized mix-of-policy RL agents using data synthesized by an LLM-based driving agent and online adaptation.
RAPID features three key designs: 1) utilization of offline data collected from an LLM agent to distil expert knowledge into RL policies for faster real-time inference; 2) introduction of robust distillation in RL to inherit both performance and robustness from the LLM-based teacher; and 3) employment of a mix-of-policy approach for joint decision decoding with a policy adapter. Through fine-tuning via online environment interaction, RAPID reduces the forgetting of LLM knowledge while maintaining adaptability to different tasks. Extensive experiments demonstrate RAPID's capability to effectively integrate LLM knowledge into scaled-down RL policies in an efficient, adaptable, and robust way. Code and checkpoints will be made publicly available upon acceptance.

Submitted 20 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
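One ingredient of such a pipeline, distilling offline LLM-agent decisions into a small policy, reduces to behavior cloning. The network shape and data format below are assumptions, not the paper's implementation.

```python
# Hedged sketch: behavior-cloning distillation of LLM-agent driving
# decisions into a small, fast student policy.
import torch
import torch.nn as nn

policy = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 5))
opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

# stand-in offline dataset: (state, action) pairs from an LLM driving agent
states = torch.randn(256, 16)
teacher_actions = torch.randint(0, 5, (256,))

for _ in range(100):  # distil teacher decisions into the student policy
    logits = policy(states)
    loss = nn.functional.cross_entropy(logits, teacher_actions)
    opt.zero_grad()
    loss.backward()
    opt.step()
print(float(loss))  # cloning loss after training
```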
arXiv:2410.12376 [pdf, other] cs.AI
ShapefileGPT: A Multi-Agent Large Language Model Framework for Automated Shapefile Processing
Authors: Qingming Lin, Rui Hu, Huaxia Li, Sensen Wu, Yadong Li, Kai Fang, Hailin Feng, Zhenhong Du, Liuchang Xu
Abstract: Vector data is one of the two core data structures in geographic information science (GIS), essential for accurately storing and representing geospatial information. Shapefile, the most widely used vector data format, has become the industry standard supported by all major geographic information systems. However, processing this data typically requires specialized GIS knowledge and skills, creating a barrier for researchers from other fields and impeding interdisciplinary research in spatial data analysis. Moreover, while large language models (LLMs) have made significant advancements in natural language processing and task automation, they still face challenges in handling the complex spatial and topological relationships inherent in GIS vector data. To address these challenges, we propose ShapefileGPT, an innovative framework powered by LLMs, specifically designed to automate Shapefile tasks. ShapefileGPT utilizes a multi-agent architecture, in which the planner agent is responsible for task decomposition and supervision, while the worker agent executes the tasks. We developed a specialized function library for handling Shapefiles and provided comprehensive API documentation, enabling the worker agent to operate on Shapefiles efficiently through function calling. For evaluation, we developed a benchmark dataset based on authoritative textbooks, encompassing tasks in categories such as geometric operations and spatial queries. ShapefileGPT achieved a task success rate of 95.24%, outperforming the GPT series models. In comparison to traditional LLMs, ShapefileGPT effectively handles complex vector data analysis tasks, overcoming the limitations of traditional LLMs in spatial analysis. This breakthrough opens new pathways for advancing automation and intelligence in the GIS field, with significant potential in interdisciplinary data analysis and application contexts.
Submitted 23 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
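The worker-agent loop described above reduces to dispatching LLM-issued function calls against a registered toolkit. A self-contained sketch of that pattern, with stubbed tool bodies and hypothetical names (a real implementation would wrap GIS libraries and parse actual LLM output):

    from typing import Callable

    TOOLS: dict[str, Callable] = {}

    def tool(fn: Callable) -> Callable:
        """Register a function so the worker agent can call it by name."""
        TOOLS[fn.__name__] = fn
        return fn

    @tool
    def count_features(path: str) -> str:
        return f"counted features in {path}"  # stub: real code would read the Shapefile

    @tool
    def buffer_layer(path: str, distance: float) -> str:
        return f"buffered {path} by {distance} m"  # stub: real code would call a GIS op

    def worker_execute(call: dict) -> str:
        """Resolve one planner-issued function call against the library."""
        return TOOLS[call["name"]](**call["arguments"])

    # What a planner agent might emit after decomposing "buffer all parcels by 50 m".
    plan = [
        {"name": "count_features", "arguments": {"path": "parcels.shp"}},
        {"name": "buffer_layer", "arguments": {"path": "parcels.shp", "distance": 50.0}},
    ]
    for step in plan:
        print(worker_execute(step))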
arXiv:2410.12160 [pdf, other] cs.LG eess.SY
When to Trust Your Data: Enhancing Dyna-Style Model-Based Reinforcement Learning With Data Filter
Authors: Yansong Li, Zeyu Dong, Ertai Luo, Yu Wu, Shuo Wu, Shuo Han
Abstract: Reinforcement learning (RL) algorithms can be divided into two classes: model-free algorithms, which are sample-inefficient, and model-based algorithms, which suffer from model bias. Dyna-style algorithms combine these two approaches by using simulated data from an estimated environmental model to accelerate model-free training. However, their efficiency is compromised when the estimated model is inaccurate. Previous works address this issue by using model ensembles or pretraining the estimated model with data collected from the real environment, increasing computational and sample complexity. To tackle this issue, we introduce an out-of-distribution (OOD) data filter that removes simulated data from the estimated model that significantly diverges from data collected in the real environment. We show theoretically that this technique enhances the quality of simulated data. With the help of the OOD data filter, the data simulated from the estimated model better mimics the data collected by interacting with the real model; this improvement is evident in the critic updates compared to using the simulated data without the OOD data filter. Our experiments integrate the data filter into the model-based policy optimization (MBPO) algorithm. The results demonstrate that our method requires fewer interactions with the real environment to achieve a higher level of optimality than MBPO, even without a model ensemble.
Submitted 15 October, 2024; originally announced October 2024.
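One simple way to realize the OOD filter sketched above is a nearest-neighbour distance test in (state, action) space: simulated transitions farther from the real buffer than real samples typically are from each other get dropped. The distance choice and quantile threshold below are illustrative assumptions, not the paper's exact criterion:

    import numpy as np

    def filter_simulated(real_sa: np.ndarray, sim_sa: np.ndarray, quantile: float = 0.95):
        """Keep simulated (s, a) rows whose 1-NN distance to the real buffer is typical."""
        # Distance from each simulated sample to its nearest real sample.
        d_sim = np.linalg.norm(sim_sa[:, None, :] - real_sa[None, :, :], axis=-1).min(axis=1)
        # Calibrate the threshold from real-to-real nearest-neighbour distances.
        d_real = np.linalg.norm(real_sa[:, None, :] - real_sa[None, :, :], axis=-1)
        np.fill_diagonal(d_real, np.inf)
        tau = np.quantile(d_real.min(axis=1), quantile)
        return sim_sa[d_sim <= tau]

    rng = np.random.default_rng(0)
    real = rng.normal(size=(256, 6))             # real-environment (state, action) pairs
    sim = rng.normal(scale=1.5, size=(512, 6))   # model rollouts, deliberately wider
    print(filter_simulated(real, sim).shape)     # fewer than 512 rows survive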
arXiv:2410.11142 [pdf, other] cs.DC
DGRO: Diameter-Guided Ring Optimization for Integrated Research Infrastructure Membership
Authors: Shixun Wu, Krishnan Raghavan, Sheng Di, Zizhong Chen, Franck Cappello
Abstract: The logical ring is a core component of membership protocols. However, logical rings fail to consider the underlying physical latency, resulting in a high diameter. To address this issue, we introduce Diameter-Guided Ring Optimization (DGRO), which focuses on constructing rings with the smallest possible diameter, selecting the most effective ring configurations, and implementing these configurations in parallel. We first explore an integration of deep Q-learning and graph embedding to optimize the ring topology. We next propose a ring selection strategy that assesses the current topology's average latency against a global benchmark, facilitating integration into modern peer-to-peer protocols and substantially reducing network diameter. To further enhance scalability, we propose a parallel strategy that distributes the topology construction process across separate partitions simultaneously. Our experiments show that: 1) DGRO efficiently constructs a network topology that achieves up to a 60% reduction in diameter compared to the best results from an extensive search over $10^5$ topologies, all within a significantly shorter computation time; 2) the ring selection of DGRO reduces the diameter of the state-of-the-art methods Chord, RAPID, and Perigee by 10%-40%, 44%, and 60%, respectively; and 3) the parallel construction scales up to $32$ partitions while maintaining the same diameter as the centralized version.
Submitted 14 October, 2024; originally announced October 2024.
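The quantity DGRO minimizes can be evaluated directly: for a given node ordering, the ring's diameter in physical latency is the worst-case cost of the cheaper of the two directions around the ring. A brute-force sketch of that metric (the learned search over orderings is out of scope here; names are illustrative):

    import itertools
    import numpy as np

    def ring_diameter(order: list[int], lat: np.ndarray) -> float:
        """Max over node pairs of the cheaper-direction latency along the ring."""
        n = len(order)
        edges = [lat[order[i], order[(i + 1) % n]] for i in range(n)]
        pre = np.concatenate(([0.0], np.cumsum(edges)))   # prefix sums of ring edges
        total = pre[-1]
        worst = 0.0
        for i, j in itertools.combinations(range(n), 2):
            cw = pre[j] - pre[i]                      # clockwise path latency i -> j
            worst = max(worst, min(cw, total - cw))   # take the shorter direction
        return worst

    rng = np.random.default_rng(0)
    lat = rng.uniform(1.0, 20.0, size=(8, 8))
    lat = (lat + lat.T) / 2                           # symmetric pairwise latencies
    print(ring_diameter(list(range(8)), lat))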
arXiv:2410.10114 [pdf, other] cs.LG cs.CL cs.CV
Mixture of Experts Made Personalized: Federated Prompt Learning for Vision-Language Models
Authors: Jun Luo, Chen Chen, Shandong Wu
Abstract: Prompt learning for pre-trained Vision-Language Models (VLMs) like CLIP has demonstrated potent applicability across diverse downstream tasks. This lightweight approach has quickly gained traction from federated learning (FL) researchers who seek to efficiently adapt VLMs to heterogeneous scenarios. However, current federated prompt learning methods are habitually restricted to the traditional FL paradigm, where the participating clients are generally only allowed to download a single globally aggregated model from the server. While justifiable for training full-sized models under federated settings, we argue that this paradigm is ill-suited for lightweight prompts. By allowing clients to download multiple pre-aggregated prompts as fixed non-local experts, we propose Personalized Federated Mixture of Adaptive Prompts (pFedMoAP), a novel FL framework that personalizes the prompt learning process through the lens of Mixture of Experts (MoE). pFedMoAP implements a local attention-based gating network that learns to generate enhanced text features for better alignment with local image data on the client, benefiting from both local and downloaded non-local adaptive prompt experts. The non-local experts are sparsely selected from a server-maintained pool, fostering collaborative learning across clients. To evaluate the proposed algorithm, we conduct extensive experiments across 9 datasets under various heterogeneous federated settings. The results show that pFedMoAP consistently outperforms the state-of-the-art alternatives, underscoring its efficacy in personalizing prompt learning for CLIP within the federated learning paradigm.
Submitted 16 October, 2024; v1 submitted 13 October, 2024; originally announced October 2024.
Comments: 16 pages, 4 figures
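The gating step above can be pictured as attention from local image features over a small bank of prompt-derived text features. A minimal sketch under assumed CLIP-style dimensions (the scoring rule and shapes are illustrative, not the paper's exact network):

    import torch
    import torch.nn.functional as F

    def gate_text_features(image_feat: torch.Tensor, expert_feats: torch.Tensor) -> torch.Tensor:
        """Mix expert text features, weighted by their affinity to the local image feature.

        image_feat: (d,) from the client's image encoder; expert_feats: (E, d),
        one row per prompt expert (1 local + E-1 downloaded, kept frozen).
        """
        scores = expert_feats @ image_feat      # (E,) affinity per expert
        weights = F.softmax(scores, dim=0)      # attention weights over experts
        return weights @ expert_feats           # (d,) mixed text feature

    image_feat = F.normalize(torch.randn(512), dim=0)
    experts = F.normalize(torch.randn(5, 512), dim=1)
    print(gate_text_features(image_feat, experts).shape)  # torch.Size([512])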
arXiv:2410.09655 [pdf, other] cs.LG stat.ML
Interpolated-MLPs: Controllable Inductive Bias
Authors: Sean Wu, Jordan Hong, Keyu Bai, Gregor Bachmann
Abstract: Due to their weak inductive bias, Multi-Layer Perceptrons (MLPs) have subpar performance at low-compute levels compared to standard architectures such as convolution-based networks (CNNs). Recent work, however, has shown that this performance gap drastically shrinks as the amount of compute is increased, without changing the amount of inductive bias. In this work, we study the converse: in the low-compute regime, how does incrementally increasing inductive bias affect performance? To quantify inductive bias, we propose a "soft MLP" approach, which we coin Interpolated MLP (I-MLP). We control the amount of inductive bias in a standard MLP by introducing a novel algorithm based on interpolation between fixed weights from a prior model with high inductive bias. We showcase our method using various prior models, including CNNs and the MLP-Mixer architecture. This interpolation scheme allows fractional control of inductive bias, which may be attractive when full inductive bias is not desired (e.g. in the mid-compute regime). We find experimentally that, for vision tasks in the low-compute regime, there is a continuous and two-sided logarithmic relationship between inductive bias and performance when using CNN and MLP-Mixer prior models.
Submitted 12 October, 2024; originally announced October 2024.
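A single interpolated layer makes the knob concrete: blend a frozen weight tensor taken from the high-inductive-bias prior with freely trained weights via a scalar alpha. The blending form below is an assumed reading of the scheme, shown for one linear map only:

    import torch
    import torch.nn as nn

    class InterpolatedLinear(nn.Module):
        """Linear layer whose weights interpolate between a frozen prior and free weights."""

        def __init__(self, prior_weight: torch.Tensor, alpha: float):
            super().__init__()
            self.register_buffer("prior", prior_weight)   # fixed, from the prior model
            self.free = nn.Parameter(0.02 * torch.randn_like(prior_weight))
            self.alpha = alpha                            # 1.0 = full prior bias, 0.0 = plain MLP

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            w = self.alpha * self.prior + (1.0 - self.alpha) * self.free
            return x @ w.T

    prior = torch.randn(16, 32)                # stand-in for unrolled CNN/Mixer weights
    layer = InterpolatedLinear(prior, alpha=0.7)
    print(layer(torch.randn(4, 32)).shape)     # torch.Size([4, 16])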
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 3 figures, ICML HiLD 2024 Workshop: 2nd Workshop on High-dimensional Learning Dynamics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08649">arXiv:2410.08649</a> <span> [<a href="https://arxiv.org/pdf/2410.08649">pdf</a>, <a href="https://arxiv.org/format/2410.08649">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> E-Motion: Future Motion Simulation via Event Sequence Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Song Wu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhiyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junhui Hou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+G">Guangming Shi</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinjian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08649v1-abstract-short" style="display: inline;"> Forecasting a typical object's future motion is a critical task for interpreting and interacting with dynamic environments in computer vision. Event-based sensors, which could capture changes in the scene with exceptional temporal granularity, may potentially offer a unique opportunity to predict future motion with a level of detail and precision previously unachievable. Inspired by that, we propo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08649v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08649v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08649v1-abstract-full" style="display: none;"> Forecasting a typical object's future motion is a critical task for interpreting and interacting with dynamic environments in computer vision. Event-based sensors, which could capture changes in the scene with exceptional temporal granularity, may potentially offer a unique opportunity to predict future motion with a level of detail and precision previously unachievable. Inspired by that, we propose to integrate the strong learning capacity of the video diffusion model with the rich motion information of an event camera as a motion simulation framework. Specifically, we initially employ pre-trained stable video diffusion models to adapt the event sequence dataset. This process facilitates the transfer of extensive knowledge from RGB videos to an event-centric domain. Moreover, we introduce an alignment mechanism that utilizes reinforcement learning techniques to enhance the reverse generation trajectory of the diffusion model, ensuring improved performance and accuracy. Through extensive testing and validation, we demonstrate the effectiveness of our method in various complex scenarios, showcasing its potential to revolutionize motion flow prediction in computer vision applications such as autonomous vehicle guidance, robotic navigation, and interactive media. 
Our findings suggest a promising direction for future research in enhancing the interpretative power and predictive accuracy of computer vision systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08649v1-abstract-full').style.display = 'none'; document.getElementById('2410.08649v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08431">arXiv:2410.08431</a> <span> [<a href="https://arxiv.org/pdf/2410.08431">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> oRetrieval Augmented Generation for 10 Large Language Models and its Generalizability in Assessing Medical Fitness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ke%2C+Y+H">Yu He Ke</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+L">Liyuan Jin</a>, <a href="/search/cs?searchtype=author&query=Elangovan%2C+K">Kabilan Elangovan</a>, <a href="/search/cs?searchtype=author&query=Abdullah%2C+H+R">Hairil Rizal Abdullah</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Nan Liu</a>, <a href="/search/cs?searchtype=author&query=Sia%2C+A+T+H">Alex Tiong Heng Sia</a>, <a href="/search/cs?searchtype=author&query=Soh%2C+C+R">Chai Rick Soh</a>, <a href="/search/cs?searchtype=author&query=Tung%2C+J+Y+M">Joshua Yi Min Tung</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+J+C+L">Jasmine Chiat Ling Ong</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+C">Chang-Fu Kuo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shao-Chun Wu</a>, <a href="/search/cs?searchtype=author&query=Kovacheva%2C+V+P">Vesela P. Kovacheva</a>, <a href="/search/cs?searchtype=author&query=Ting%2C+D+S+W">Daniel Shu Wei Ting</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08431v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) show potential for medical applications but often lack specialized clinical knowledge. Retrieval Augmented Generation (RAG) allows customization with domain-specific information, making it suitable for healthcare. This study evaluates the accuracy, consistency, and safety of RAG models in determining fitness for surgery and providing preoperative instructions. 
arXiv:2410.08431 [pdf] cs.CL cs.AI
Retrieval Augmented Generation for 10 Large Language Models and its Generalizability in Assessing Medical Fitness
Authors: Yu He Ke, Liyuan Jin, Kabilan Elangovan, Hairil Rizal Abdullah, Nan Liu, Alex Tiong Heng Sia, Chai Rick Soh, Joshua Yi Min Tung, Jasmine Chiat Ling Ong, Chang-Fu Kuo, Shao-Chun Wu, Vesela P. Kovacheva, Daniel Shu Wei Ting
Abstract: Large Language Models (LLMs) show potential for medical applications but often lack specialized clinical knowledge. Retrieval Augmented Generation (RAG) allows customization with domain-specific information, making it suitable for healthcare. This study evaluates the accuracy, consistency, and safety of RAG models in determining fitness for surgery and providing preoperative instructions. We developed LLM-RAG models using 35 local and 23 international preoperative guidelines and tested them against human-generated responses; a total of 3,682 responses were evaluated. Clinical documents were processed using LlamaIndex, and 10 LLMs, including GPT3.5, GPT4, and Claude-3, were assessed. Fourteen clinical scenarios were analyzed, focusing on seven aspects of preoperative instructions. Established guidelines and expert judgment were used to determine correct responses, with human-generated answers serving as comparisons. The LLM-RAG models generated responses within 20 seconds, significantly faster than clinicians (10 minutes). The GPT4 LLM-RAG model achieved the highest accuracy (96.4% vs. 86.6%, p=0.016), with no hallucinations, producing correct instructions comparable to those of clinicians. Results were consistent across both local and international guidelines. This study demonstrates the potential of LLM-RAG models for preoperative healthcare tasks, highlighting their efficiency, scalability, and reliability.
Submitted 10 October, 2024; originally announced October 2024.
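The guideline-grounded pipeline the study describes maps naturally onto LlamaIndex's document/index/query-engine stack. A minimal sketch, assuming a recent llama_index release (import paths have moved between versions), a configured LLM API key, and a hypothetical local folder of guideline PDFs:

    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

    # Load local and international preoperative guidelines from disk (folder name assumed).
    guidelines = SimpleDirectoryReader("preop_guidelines/").load_data()

    # Embed and index the guideline chunks, then expose a retrieval-augmented query engine.
    index = VectorStoreIndex.from_documents(guidelines)
    engine = index.as_query_engine(similarity_top_k=4)

    response = engine.query(
        "Patient on apixaban scheduled for elective colonoscopy: "
        "is the patient fit for the procedure, and which preoperative instructions apply?"
    )
    print(response)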
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2402.01733</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08136">arXiv:2410.08136</a> <span> [<a href="https://arxiv.org/pdf/2410.08136">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> SoundScape: A Human-AI Co-Creation System Making Your Memories Heard </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+C">Chongjun Zhong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiaxing Yu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yingping Cao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Songruoyao Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenqi Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08136v1-abstract-short" style="display: inline;"> Sound plays a significant role in human memory, yet it is often overlooked by mainstream life-recording methods. Most current UGC (User-Generated Content) creation tools emphasize visual content while lacking user-friendly sound design features. This paper introduces SoundScape, a human-AI co-creation system that allows users to easily create sound memories on mobile devices through innovative int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08136v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08136v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08136v1-abstract-full" style="display: none;"> Sound plays a significant role in human memory, yet it is often overlooked by mainstream life-recording methods. Most current UGC (User-Generated Content) creation tools emphasize visual content while lacking user-friendly sound design features. This paper introduces SoundScape, a human-AI co-creation system that allows users to easily create sound memories on mobile devices through innovative interaction. By integrating sound effects and music with visual scenes, SoundScape encourages users to enrich their creations with immersive sound elements, enhancing the atmosphere of their works. To support public creation, SoundScape incorporates a conversational agent and AI music generation technology. User studies indicate that our approach is effective for sound memory creation, with SoundScape outperforming existing tools in terms of user experience and the perceived quality of produced works. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08136v1-abstract-full').style.display = 'none'; document.getElementById('2410.08136v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07819">arXiv:2410.07819</a> <span> [<a href="https://arxiv.org/pdf/2410.07819">pdf</a>, <a href="https://arxiv.org/format/2410.07819">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Uncovering Overfitting in Large Language Model Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xiaotian Ye</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+P">Pengjie Ren</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhumin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07819v1-abstract-short" style="display: inline;"> Knowledge editing has been proposed as an effective method for updating and correcting the internal knowledge of Large Language Models (LLMs). However, existing editing methods often struggle with complex tasks, such as multi-hop reasoning. In this paper, we identify and investigate the phenomenon of Editing Overfit, where edited models assign disproportionately high probabilities to the edit targ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07819v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07819v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07819v1-abstract-full" style="display: none;"> Knowledge editing has been proposed as an effective method for updating and correcting the internal knowledge of Large Language Models (LLMs). However, existing editing methods often struggle with complex tasks, such as multi-hop reasoning. In this paper, we identify and investigate the phenomenon of Editing Overfit, where edited models assign disproportionately high probabilities to the edit target, hindering the generalization of new knowledge in complex scenarios. We attribute this issue to the current editing paradigm, which places excessive emphasis on the direct correspondence between the input prompt and the edit target for each edit sample. To further explore this issue, we introduce a new benchmark, EVOKE (EValuation of Editing Overfit in Knowledge Editing), along with fine-grained evaluation metrics. Through comprehensive experiments and analysis, we demonstrate that Editing Overfit is prevalent in current editing methods and that common overfitting mitigation strategies are of limited effectiveness in knowledge editing. 
arXiv:2410.07819 [pdf, other] cs.CL
Uncovering Overfitting in Large Language Model Editing
Authors: Mengqi Zhang, Xiaotian Ye, Qiang Liu, Pengjie Ren, Shu Wu, Zhumin Chen
Abstract: Knowledge editing has been proposed as an effective method for updating and correcting the internal knowledge of Large Language Models (LLMs). However, existing editing methods often struggle with complex tasks, such as multi-hop reasoning. In this paper, we identify and investigate the phenomenon of Editing Overfit, where edited models assign disproportionately high probabilities to the edit target, hindering the generalization of new knowledge in complex scenarios. We attribute this issue to the current editing paradigm, which places excessive emphasis on the direct correspondence between the input prompt and the edit target for each edit sample. To further explore this issue, we introduce a new benchmark, EVOKE (EValuation of Editing Overfit in Knowledge Editing), along with fine-grained evaluation metrics. Through comprehensive experiments and analysis, we demonstrate that Editing Overfit is prevalent in current editing methods and that common overfitting mitigation strategies are of limited effectiveness in knowledge editing. To overcome this, inspired by LLMs' knowledge recall mechanisms, we propose a new plug-and-play strategy called Learn to Inference (LTI), which introduces a Multi-stage Inference Constraint module to guide the edited models in recalling new knowledge similarly to how unedited LLMs leverage knowledge through in-context learning. Extensive experimental results across a wide range of tasks validate the effectiveness of LTI in mitigating Editing Overfit.
Submitted 10 October, 2024; originally announced October 2024.
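Editing Overfit as defined above is directly measurable: after an edit, compare the probability the model assigns to the edit target in the edited context versus an off-target context where that token should not dominate. A toy probe with Hugging Face transformers (gpt2 and the prompts are placeholders; an EVOKE-style evaluation would use the edited model and benchmark prompts):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    def next_token_prob(prompt: str, target: str) -> float:
        """Probability of the first token of `target` following `prompt`."""
        ids = tok(prompt, return_tensors="pt").input_ids
        with torch.no_grad():
            logits = model(ids).logits[0, -1]
        target_id = tok(" " + target).input_ids[0]
        return torch.softmax(logits, dim=-1)[target_id].item()

    # Imagine the edit was "The capital of France is Rome".
    on_target = next_token_prob("The capital of France is", "Rome")
    off_target = next_token_prob("The official currency of France is", "Rome")
    print(on_target, off_target)  # an overfit edited model keeps off_target high too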
arXiv:2410.07757 [pdf, other] cs.CV
MMHead: Towards Fine-grained Multi-modal 3D Facial Animation
Authors: Sijing Wu, Yunhao Li, Yichao Yan, Huiyu Duan, Ziwei Liu, Guangtao Zhai
Abstract: 3D facial animation has attracted considerable attention due to its extensive applications in the multimedia field. Audio-driven 3D facial animation has been widely explored with promising results. However, multi-modal 3D facial animation, especially text-guided 3D facial animation, is rarely explored due to the lack of multi-modal 3D facial animation datasets. To fill this gap, we first construct a large-scale multi-modal 3D facial animation dataset, MMHead, which consists of 49 hours of 3D facial motion sequences, speech audio, and rich hierarchical text annotations. Each text annotation contains abstract action and emotion descriptions, fine-grained facial and head movement (i.e., expression and head pose) descriptions, and three possible scenarios that may cause such emotion. Concretely, we integrate five public 2D portrait video datasets, and propose an automatic pipeline to 1) reconstruct 3D facial motion sequences from monocular videos; and 2) obtain hierarchical text annotations with the help of AU detection and ChatGPT. Based on the MMHead dataset, we establish benchmarks for two new tasks: text-induced 3D talking head animation and text-to-3D facial motion generation. Moreover, a simple but efficient VQ-VAE-based method named MM2Face is proposed to unify the multi-modal information and generate diverse and plausible 3D facial motions, achieving competitive results on both benchmarks. Extensive experiments and comprehensive analysis demonstrate the significant potential of our dataset and benchmarks in promoting the development of multi-modal 3D facial animation.
Submitted 10 October, 2024; originally announced October 2024.
Comments: Accepted by ACMMM 2024. Project page: https://wsj-sjtu.github.io/MMHead/
arXiv:2410.07153 [pdf, other] cs.CV cs.LG
CHASE: Learning Convex Hull Adaptive Shift for Skeleton-based Multi-Entity Action Recognition
Authors: Yuhang Wen, Mengyuan Liu, Songtao Wu, Beichen Ding
Abstract: Skeleton-based multi-entity action recognition is a challenging task aiming to identify interactive actions or group activities involving multiple diverse entities. Existing models for individuals often fall short in this task due to the inherent distribution discrepancies among entity skeletons, leading to suboptimal backbone optimization. To this end, we introduce a Convex Hull Adaptive Shift based multi-Entity action recognition method (CHASE), which mitigates inter-entity distribution gaps and unbiases subsequent backbones. Specifically, CHASE comprises a learnable parameterized network and an auxiliary objective. The parameterized network achieves plausible, sample-adaptive repositioning of skeleton sequences through two key components. First, the Implicit Convex Hull Constrained Adaptive Shift ensures that the new origin of the coordinate system is within the skeleton convex hull. Second, the Coefficient Learning Block provides a lightweight parameterization of the mapping from skeleton sequences to their specific coefficients in convex combinations. Moreover, to guide the optimization of this network for discrepancy minimization, we propose the Mini-batch Pair-wise Maximum Mean Discrepancy as the additional objective. CHASE operates as a sample-adaptive normalization method to mitigate inter-entity distribution discrepancies, thereby reducing data bias and improving the subsequent classifier's multi-entity action recognition performance. Extensive experiments on six datasets, including NTU Mutual 11/26, H2O, Assembly101, Collective Activity and Volleyball, consistently verify our approach by seamlessly adapting to single-entity backbones and boosting their performance in multi-entity scenarios. Our code is publicly available at https://github.com/Necolizer/CHASE .
Submitted 9 October, 2024; originally announced October 2024.
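The first component above has a neat closed form: softmax-normalized coefficients make the learned origin a convex combination of the skeleton's own joints, so it lies inside the convex hull by construction. A minimal sketch of that repositioning step (shapes and the per-frame choice are illustrative assumptions):

    import torch
    import torch.nn.functional as F

    def adaptive_shift(skel: torch.Tensor, joint_logits: torch.Tensor) -> torch.Tensor:
        """Recenter a skeleton sequence on a learned in-hull origin.

        skel: (T, J, C) joint coordinates; joint_logits: (J,) learned scores,
        e.g. produced by a coefficient-learning block.
        """
        coeff = F.softmax(joint_logits, dim=0)           # convex weights: >= 0, sum to 1
        origin = torch.einsum("j,tjc->tc", coeff, skel)  # per-frame convex combination
        return skel - origin[:, None, :]                 # shift sequence to the new origin

    skel = torch.randn(16, 25, 3)                 # 16 frames of an NTU-style 25-joint skeleton
    logits = torch.zeros(25, requires_grad=True)  # uniform weights -> per-frame joint centroid
    print(adaptive_shift(skel, logits).shape)     # torch.Size([16, 25, 3])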
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Camera-ready Version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06934">arXiv:2410.06934</a> <span> [<a href="https://arxiv.org/pdf/2410.06934">pdf</a>, <a href="https://arxiv.org/format/2410.06934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> VEC-Sim: A Simulation Platform for Evaluating Service Caching and Computation Offloading Policies in Vehicular Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolong Xu</a>, <a href="/search/cs?searchtype=author&query=Bilal%2C+M">Muhammad Bilal</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangwei Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Siyu Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06934v1-abstract-short" style="display: inline;"> Computer simulation platforms offer an alternative solution by emulating complex systems in a controlled manner. However, existing Edge Computing (EC) simulators, as well as general-purpose vehicular network simulators, are not tailored for VEC and lack dedicated support for modeling the distinct access pattern, entity mobility trajectory and other unique characteristics of VEC networks. To fill t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06934v1-abstract-full" style="display: none;"> Computer simulation platforms offer an alternative solution by emulating complex systems in a controlled manner. However, existing Edge Computing (EC) simulators, as well as general-purpose vehicular network simulators, are not tailored for VEC and lack dedicated support for modeling the distinct access pattern, entity mobility trajectory and other unique characteristics of VEC networks. To fill this gap, this paper proposes VEC-Sim, a versatile simulation platform for in-depth evaluation and analysis of various service caching and computation offloading policies in VEC networks. VEC-Sim incorporates realistic mechanisms to replicate real-world access patterns, including service feature vector, vehicle mobility modeling, evolving service popularity, new service upload and user preference shifts, etc. Moreover, its modular architecture and extensive Application Programming Interfaces (APIs) allow seamless integration of customized scheduling policies and user-defined metrics. A comprehensive evaluation of VEC-Sim's capabilities is undertaken in comparison to real-world ground truths. Results prove it to be accurate in reproducing classical scheduling algorithms and extremely effective in conducting case studies. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06934v1-abstract-full').style.display = 'none'; document.getElementById('2410.06934v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>