Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 769 results for author: <span class="mathjax">Yao, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yao%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yao, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yao%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yao, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yao%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yao%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yao%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yao%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yao%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yao%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12734">arXiv:2411.12734</a> <span> [<a href="https://arxiv.org/pdf/2411.12734">pdf</a>, <a href="https://arxiv.org/format/2411.12734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Soft Robotic Dynamic In-Hand Pen Spinning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yunchao Yao</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+U">Uksang Yoo</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jean Oh</a>, <a href="/search/cs?searchtype=author&query=Atkeson%2C+C+G">Christopher G. Atkeson</a>, <a href="/search/cs?searchtype=author&query=Ichnowski%2C+J">Jeffrey Ichnowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12734v1-abstract-short" style="display: inline;"> Dynamic in-hand manipulation remains a challenging task for soft robotic systems that have demonstrated advantages in safe compliant interactions but struggle with high-speed dynamic tasks. In this work, we present SWIFT, a system for learning dynamic tasks using a soft and compliant robotic hand. 
Unlike previous works that rely on simulation, quasi-static actions and precise object models, the pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12734v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12734v1-abstract-full" style="display: none;"> Dynamic in-hand manipulation remains a challenging task for soft robotic systems that have demonstrated advantages in safe compliant interactions but struggle with high-speed dynamic tasks. In this work, we present SWIFT, a system for learning dynamic tasks using a soft and compliant robotic hand. Unlike previous works that rely on simulation, quasi-static actions and precise object models, the proposed system learns to spin a pen through trial-and-error using only real-world data without requiring explicit prior knowledge of the pen's physical attributes. With self-labeled trials sampled from the real world, the system discovers the set of pen grasping and spinning primitive parameters that enables a soft hand to spin a pen robustly and reliably. After 130 sampled actions per object, SWIFT achieves 100% success rate across three pens with different weights and weight distributions, demonstrating the system's generalizability and robustness to changes in object properties. The results highlight the potential for soft robotic end-effectors to perform dynamic tasks including rapid in-hand manipulation. We also demonstrate that SWIFT generalizes to spinning items with different shapes and weights such as a brush and a screwdriver which we spin with 10/10 and 5/10 success rates respectively. Videos, data, and code are available at https://soft-spin.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12734v1-abstract-full').style.display = 'none'; document.getElementById('2411.12734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09209">arXiv:2411.09209</a> <span> [<a href="https://arxiv.org/pdf/2411.09209">pdf</a>, <a href="https://arxiv.org/format/2411.09209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> JoyVASA: Portrait and Animal Image Animation with Diffusion-Based Audio-Driven Facial Dynamics and Head Motion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuyang Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoxin Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Sheng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jun Zhao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yang Yao</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+J">Jintao Fei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Minyu Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09209v3-abstract-short" style="display: inline;"> Audio-driven portrait animation has made significant advances with diffusion-based models, improving video quality and lipsync accuracy. However, the increasing complexity of these models has led to inefficiencies in training and inference, as well as constraints on video length and inter-frame continuity. In this paper, we propose JoyVASA, a diffusion-based method for generating facial dynamics a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09209v3-abstract-full').style.display = 'inline'; document.getElementById('2411.09209v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09209v3-abstract-full" style="display: none;"> Audio-driven portrait animation has made significant advances with diffusion-based models, improving video quality and lipsync accuracy. However, the increasing complexity of these models has led to inefficiencies in training and inference, as well as constraints on video length and inter-frame continuity. In this paper, we propose JoyVASA, a diffusion-based method for generating facial dynamics and head motion in audio-driven facial animation. Specifically, in the first stage, we introduce a decoupled facial representation framework that separates dynamic facial expressions from static 3D facial representations. This decoupling allows the system to generate longer videos by combining any static 3D facial representation with dynamic motion sequences. Then, in the second stage, a diffusion transformer is trained to generate motion sequences directly from audio cues, independent of character identity. Finally, a generator trained in the first stage uses the 3D facial representation and the generated motion sequences as inputs to render high-quality animations. With the decoupled facial representation and the identity-independent motion generation process, JoyVASA extends beyond human portraits to animate animal faces seamlessly. The model is trained on a hybrid dataset of private Chinese and public English data, enabling multilingual support. 
Experimental results validate the effectiveness of our approach. Future work will focus on improving real-time performance and refining expression control, further expanding the applications in portrait animation. The code is available at: https://github.com/jdh-algo/JoyVASA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09209v3-abstract-full').style.display = 'none'; document.getElementById('2411.09209v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07215">arXiv:2411.07215</a> <span> [<a href="https://arxiv.org/pdf/2411.07215">pdf</a>, <a href="https://arxiv.org/ps/2411.07215">ps</a>, <a href="https://arxiv.org/format/2411.07215">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Semantic Logical Relations for Timed Message-Passing Protocols (Extended Version) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yue Yao</a>, <a href="/search/cs?searchtype=author&query=Iraci%2C+G">Grant Iraci</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+C">Cheng-En Chuang</a>, <a href="/search/cs?searchtype=author&query=Balzer%2C+S">Stephanie Balzer</a>, <a href="/search/cs?searchtype=author&query=Ziarek%2C+L">Lukasz Ziarek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07215v1-abstract-short" style="display: inline;"> Many of today's message-passing systems not only require messages to be exchanged in a certain order but also to happen at a certain \emph{time} or within a certain \emph{time window}. Such correctness conditions are particularly prominent in Internet of Things (IoT) and real-time systems applications, which interface with hardware devices that come with inherent timing constraints. Verifying comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07215v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07215v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07215v1-abstract-full" style="display: none;"> Many of today's message-passing systems not only require messages to be exchanged in a certain order but also to happen at a certain \emph{time} or within a certain \emph{time window}. Such correctness conditions are particularly prominent in Internet of Things (IoT) and real-time systems applications, which interface with hardware devices that come with inherent timing constraints. Verifying compliance of such systems with the intended \emph{timed protocol} is challenged by their \emph{heterogeneity} -- ruling out any verification method that relies on the system to be implemented in one common language, let alone in a high-level and typed programming language. 
To address this challenge, this paper contributes a \emph{logical relation} to verify that its inhabitants (the applications and hardware devices to be proved correct) comply with the given timed protocol. To cater to the systems' heterogeneity, the logical relation is entirely \emph{semantic}, lifting the requirement that its inhabitants are syntactically well-typed. A semantic approach enables two modes of use of the logical relation for program verification: (i) \emph{once-and-for-all} verification of an \emph{arbitrary} well-typed application, given a type system, and (ii) \emph{per-instance} verification of a specific application / hardware device (a.k.a. foreign code). To facilitate mode (i), the paper develops a refinement type system for expressing timed message-passing protocols and proves that any well-typed program inhabits the logical relation (fundamental theorem). A type checker for the refinement type system has been implemented in Rust, using an SMT solver to check satisfiability of timing constraints. Then, the paper demonstrates both modes of use based on a small case study of a smart home system for monitoring air quality, consisting of a controller application and various environment sensors.
Submitted 11 November, 2024; originally announced November 2024.

4. arXiv:2411.06959 [pdf, other] (cs.CV, cs.AI)
ENAT: Rethinking Spatial-temporal Interactions in Token-based Image Synthesis
Authors: Zanlin Ni, Yulin Wang, Renping Zhou, Yizeng Han, Jiayi Guo, Zhiyuan Liu, Yuan Yao, Gao Huang
Abstract: Recently, token-based generation has demonstrated its effectiveness in image synthesis. As a representative example, non-autoregressive Transformers (NATs) can generate decent-quality images in a few steps. NATs perform generation in a progressive manner, where the latent tokens of a resulting image are incrementally revealed. At each step, the unrevealed image regions are padded with mask tokens and inferred by NAT. In this paper, we delve into the mechanisms behind the effectiveness of NATs and uncover two important patterns that naturally emerge from NATs: Spatially (within a step), although mask and visible tokens are processed uniformly by NATs, the interactions between them are highly asymmetric. In specific, mask tokens mainly gather information for decoding, while visible tokens tend to primarily provide information, and their deep representations can be built only upon themselves. Temporally (across steps), the interactions between adjacent generation steps mostly concentrate on updating the representations of a few critical tokens, while the computation for the majority of tokens is generally repetitive. Driven by these findings, we propose EfficientNAT (ENAT), a NAT model that explicitly encourages these critical interactions inherent in NATs. At the spatial level, we disentangle the computations of visible and mask tokens by encoding visible tokens independently, while decoding mask tokens conditioned on the fully encoded visible tokens. At the temporal level, we prioritize the computation of the critical tokens at each step, while maximally reusing previously computed token representations to supplement necessary information. ENAT improves the performance of NATs notably with significantly reduced computational cost. Experiments on ImageNet-256, ImageNet-512 and MS-COCO validate the effectiveness of ENAT. Code is available at https://github.com/LeapLabTHU/ENAT.
Submitted 11 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06427">arXiv:2411.06427</a> <span> [<a href="https://arxiv.org/pdf/2411.06427">pdf</a>, <a href="https://arxiv.org/format/2411.06427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UniGAD: Unifying Multi-level Graph Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yiqing Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jianheng Tang</a>, <a href="/search/cs?searchtype=author&query=Zi%2C+C">Chenyi Zi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H+V">H. Vicky Zhao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06427v1-abstract-short" style="display: inline;"> Graph Anomaly Detection (GAD) aims to identify uncommon, deviated, or suspicious objects within graph-structured data. Existing methods generally focus on a single graph object type (node, edge, graph, etc.) and often overlook the inherent connections among different object types of graph anomalies. For instance, a money laundering transaction might involve an abnormal account and the broader comm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06427v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06427v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06427v1-abstract-full" style="display: none;"> Graph Anomaly Detection (GAD) aims to identify uncommon, deviated, or suspicious objects within graph-structured data. Existing methods generally focus on a single graph object type (node, edge, graph, etc.) and often overlook the inherent connections among different object types of graph anomalies. For instance, a money laundering transaction might involve an abnormal account and the broader community it interacts with. To address this, we present UniGAD, the first unified framework for detecting anomalies at node, edge, and graph levels jointly. Specifically, we develop the Maximum Rayleigh Quotient Subgraph Sampler (MRQSampler) that unifies multi-level formats by transferring objects at each level into graph-level tasks on subgraphs. We theoretically prove that MRQSampler maximizes the accumulated spectral energy of subgraphs (i.e., the Rayleigh quotient) to preserve the most significant anomaly information. To further unify multi-level training, we introduce a novel GraphStitch Network to integrate information across different levels, adjust the amount of sharing required at each level, and harmonize conflicting training goals. Comprehensive experiments show that UniGAD outperforms both existing GAD methods specialized for a single task and graph prompt-based approaches for multiple tasks, while also providing robust zero-shot task transferability. 
All codes can be found at https://github.com/lllyyq1121/UniGAD.
Submitted 10 November, 2024; originally announced November 2024.
Comments: Accepted by NeurIPS 2024. All codes can be found at https://github.com/lllyyq1121/UniGAD

6. arXiv:2411.05902 [pdf, other] (cs.CV, cs.CL)
Autoregressive Models in Vision: A Survey
Authors: Jing Xiong, Gongye Liu, Lun Huang, Chengyue Wu, Taiqiang Wu, Yao Mu, Yuan Yao, Hui Shen, Zhongwei Wan, Jinfa Huang, Chaofan Tao, Shen Yan, Huaxiu Yao, Lingpeng Kong, Hongxia Yang, Mi Zhang, Guillermo Sapiro, Jiebo Luo, Ping Luo, Ngai Wong
Abstract: Autoregressive modeling has been a huge success in the field of natural language processing (NLP). Recently, autoregressive models have emerged as a significant area of focus in computer vision, where they excel in producing high-quality visual content. Autoregressive models in NLP typically operate on subword tokens. However, the representation strategy in computer vision can vary in different levels, \textit{i.e.}, pixel-level, token-level, or scale-level, reflecting the diverse and hierarchical nature of visual data compared to the sequential structure of language. This survey comprehensively examines the literature on autoregressive models applied to vision. To improve readability for researchers from diverse research backgrounds, we start with preliminary sequence representation and modeling in vision. Next, we divide the fundamental frameworks of visual autoregressive models into three general sub-categories, including pixel-based, token-based, and scale-based models based on the strategy of representation. We then explore the interconnections between autoregressive models and other generative models. Furthermore, we present a multi-faceted categorization of autoregressive models in computer vision, including image generation, video generation, 3D generation, and multi-modal generation. We also elaborate on their applications in diverse domains, including emerging domains such as embodied AI and 3D medical AI, with about 250 related references. Finally, we highlight the current challenges to autoregressive models in vision with suggestions about potential research directions. We have also set up a Github repository to organize the papers included in this survey at: \url{https://github.com/ChaofanTao/Autoregressive-Models-in-Vision-Survey}.
Submitted 8 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05748">arXiv:2411.05748</a> <span> [<a href="https://arxiv.org/pdf/2411.05748">pdf</a>, <a href="https://arxiv.org/format/2411.05748">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Multi-Dimensional Reconfigurable, Physically Composable Hybrid Diffractive Optical Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yu Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05748v1-abstract-short" style="display: inline;"> Diffractive optical neural networks (DONNs), leveraging free-space light wave propagation for ultra-parallel, high-efficiency computing, have emerged as promising artificial intelligence (AI) accelerators. However, their inherent lack of reconfigurability due to fixed optical structures post-fabrication hinders practical deployment in the face of dynamic AI workloads and evolving applications. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05748v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05748v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05748v1-abstract-full" style="display: none;"> Diffractive optical neural networks (DONNs), leveraging free-space light wave propagation for ultra-parallel, high-efficiency computing, have emerged as promising artificial intelligence (AI) accelerators. However, their inherent lack of reconfigurability due to fixed optical structures post-fabrication hinders practical deployment in the face of dynamic AI workloads and evolving applications. To overcome this challenge, we introduce, for the first time, a multi-dimensional reconfigurable hybrid diffractive ONN system (MDR-HDONN), a physically composable architecture that unlocks a new degree of freedom and unprecedented versatility in DONNs. By leveraging full-system learnability, MDR-HDONN repurposes fixed fabricated optical hardware, achieving exponentially expanded functionality and superior task adaptability through the differentiable learning of system variables. Furthermore, MDR-HDONN adopts a hybrid optical/photonic design, combining the reconfigurability of integrated photonics with the ultra-parallelism of free-space diffractive systems. Extensive evaluations demonstrate that MDR-HDONN has digital-comparable accuracy on various task adaptations with 74x faster speed and 194x lower energy. Compared to prior DONNs, MDR-HDONN shows exponentially larger functional space with 5x faster training speed, paving the way for a new paradigm of versatile, composable, hybrid optical/photonic AI computing. 
We will open-source our codes.
Submitted 8 November, 2024; originally announced November 2024.
Comments: 7 pages

8. arXiv:2411.05281 [pdf, other] (cs.CL, cs.AI, cs.LG)
Fox-1 Technical Report
Authors: Zijian Hu, Jipeng Zhang, Rui Pan, Zhaozhuo Xu, Shanshan Han, Han Jin, Alay Dilipbhai Shah, Dimitris Stripelis, Yuhang Yao, Salman Avestimehr, Chaoyang He, Tong Zhang
Abstract: We present Fox-1, a series of small language models (SLMs) consisting of Fox-1-1.6B and Fox-1-1.6B-Instruct-v0.1.
These models are pre-trained on 3 trillion tokens of web-scraped document data and fine-tuned with 5 billion tokens of instruction-following and multi-turn conversation data. Aiming to improve the pre-training efficiency, Fox-1-1.6B model introduces a novel 3-stage data curriculum across all the training data with 2K-8K sequence length. In architecture design, Fox-1 features a deeper layer structure, an expanded vocabulary, and utilizes Grouped Query Attention (GQA), offering a performant and efficient architecture compared to other SLMs. Fox-1 achieves better or on-par performance in various benchmarks compared to StableLM-2-1.6B, Gemma-2B, Qwen1.5-1.8B, and OpenELM1.1B, with competitive inference speed and throughput. The model weights have been released under the Apache 2.0 license, where we aim to promote the democratization of LLMs and make them fully accessible to the whole open-source community.
Submitted 17 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
Comments: Base model is available at https://huggingface.co/tensoropera/Fox-1-1.6B and the instruction-tuned version is available at https://huggingface.co/tensoropera/Fox-1-1.6B-Instruct-v0.1

9. arXiv:2411.05209 [pdf, other] (cs.AI, cs.CL)
Alopex: A Computational Framework for Enabling On-Device Function Calls with LLMs
Authors: Yide Ran, Zhaozhuo Xu, Yuhang Yao, Zijian Hu, Shanshan Han, Han Jin, Alay Dilipbhai Shah, Jipeng Zhang, Dimitris Stripelis, Tong Zhang, Salman Avestimehr, Chaoyang He
class="abstract-short has-text-grey-dark mathjax" id="2411.05209v1-abstract-short" style="display: inline;"> The rapid advancement of Large Language Models (LLMs) has led to their increased integration into mobile devices for personalized assistance, which enables LLMs to call external API functions to enhance their performance. However, challenges such as data scarcity, ineffective question formatting, and catastrophic forgetting hinder the development of on-device LLM agents. To tackle these issues, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05209v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05209v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05209v1-abstract-full" style="display: none;"> The rapid advancement of Large Language Models (LLMs) has led to their increased integration into mobile devices for personalized assistance, which enables LLMs to call external API functions to enhance their performance. However, challenges such as data scarcity, ineffective question formatting, and catastrophic forgetting hinder the development of on-device LLM agents. To tackle these issues, we propose Alopex, a framework that enables precise on-device function calls using the Fox LLM. Alopex introduces a logic-based method for generating high-quality training data and a novel ``description-question-output'' format for fine-tuning, reducing risks of function information leakage. Additionally, a data mixing strategy is used to mitigate catastrophic forgetting, combining function call data with textbook datasets to enhance performance in various tasks. Experimental results show that Alopex improves function call accuracy and significantly reduces catastrophic forgetting, providing a robust solution for integrating function call capabilities into LLMs without manual intervention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05209v1-abstract-full').style.display = 'none'; document.getElementById('2411.05209v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04576">arXiv:2411.04576</a> <span> [<a href="https://arxiv.org/pdf/2411.04576">pdf</a>, <a href="https://arxiv.org/format/2411.04576">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> "I Always Felt that Something Was Wrong.": Understanding Compliance Risks and Mitigation Strategies when Professionals Use Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Siying Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Piaohong Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yaxing Yao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhicong Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04576v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have been increasingly adopted by professionals for work tasks. However, using LLMs also introduces compliance risks relating to privacy, ethics, and regulations. This study investigated the compliance risks professionals perceive with LLM use and their risk mitigation strategies. Semi-structured interviews were conducted with 24 law, healthcare, and academia professio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04576v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04576v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04576v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have been increasingly adopted by professionals for work tasks. However, using LLMs also introduces compliance risks relating to privacy, ethics, and regulations. This study investigated the compliance risks professionals perceive with LLM use and their risk mitigation strategies. Semi-structured interviews were conducted with 24 law, healthcare, and academia professionals. Results showed that the main compliance concerns centered around potential exposure to sensitive customer/patient information through LLMs. To address risks, professionals reported proactively inputting distorted data to preserve privacy. However, full compliance proved challenging, given the complex interactions between user inputs, LLM behaviors, and regulations. This research provides valuable insights into designing LLMs with built-in privacy and risk controls to support professionals' evaluation and adoption of emerging AI technologies while meeting compliance obligations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04576v1-abstract-full').style.display = 'none'; document.getElementById('2411.04576v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03068">arXiv:2411.03068</a> <span> [<a href="https://arxiv.org/pdf/2411.03068">pdf</a>, <a href="https://arxiv.org/format/2411.03068">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Alpha and Prejudice: Improving $伪$-sized Worst-case Fairness via Intrinsic Reweighting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yinghua Yao</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yuangang Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xuanqian Wang</a>, <a href="/search/cs?searchtype=author&query=Tsang%2C+I+W">Ivor W. Tsang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xiuju Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03068v1-abstract-short" style="display: inline;"> Worst-case fairness with off-the-shelf demographics achieves group parity by maximizing the model utility of the worst-off group. Nevertheless, demographic information is often unavailable in practical scenarios, which impedes the use of such a direct max-min formulation. Recent advances have reframed this learning problem by introducing the lower bound of minimal partition ratio, denoted as $伪$,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03068v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03068v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03068v1-abstract-full" style="display: none;"> Worst-case fairness with off-the-shelf demographics achieves group parity by maximizing the model utility of the worst-off group. Nevertheless, demographic information is often unavailable in practical scenarios, which impedes the use of such a direct max-min formulation. Recent advances have reframed this learning problem by introducing the lower bound of minimal partition ratio, denoted as $伪$, as side information, referred to as ``$伪$-sized worst-case fairness'' in this paper. We first justify the practical significance of this setting by presenting noteworthy evidence from the data privacy perspective, which has been overlooked by existing research. Without imposing specific requirements on loss functions, we propose reweighting the training samples based on their intrinsic importance to fairness. Given the global nature of the worst-case formulation, we further develop a stochastic learning scheme to simplify the training process without compromising model performance. Additionally, we address the issue of outliers and provide a robust variant to handle potential outliers during model training. Our theoretical analysis and experimental observations reveal the connections between the proposed approaches and existing ``fairness-through-reweighting'' studies, with extensive experimental results on fairness benchmarks demonstrating the superiority of our methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03068v1-abstract-full').style.display = 'none'; document.getElementById('2411.03068v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02149">arXiv:2411.02149</a> <span> [<a href="https://arxiv.org/pdf/2411.02149">pdf</a>, <a href="https://arxiv.org/format/2411.02149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-72691-0_11">10.1007/978-3-031-72691-0_11 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Improving Domain Generalization in Self-supervised Monocular Depth Estimation via Stabilized Adversarial Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuanqi Yao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Gang Wu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kui Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siao Liu</a>, <a href="/search/cs?searchtype=author&query=Kuai%2C+J">Jian Kuai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xianming Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Junjun Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02149v2-abstract-short" style="display: inline;"> Learning a self-supervised Monocular Depth Estimation (MDE) model with great generalization remains significantly challenging. Despite the success of adversarial augmentation in the supervised learning generalization, naively incorporating it into self-supervised MDE models potentially causes over-regularization, suffering from severe performance degradation. In this paper, we conduct qualitative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02149v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02149v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02149v2-abstract-full" style="display: none;"> Learning a self-supervised Monocular Depth Estimation (MDE) model with great generalization remains significantly challenging. Despite the success of adversarial augmentation in the supervised learning generalization, naively incorporating it into self-supervised MDE models potentially causes over-regularization, suffering from severe performance degradation. In this paper, we conduct qualitative analysis and illuminate the main causes: (i) inherent sensitivity in the UNet-alike depth network and (ii) dual optimization conflict caused by over-regularization. 
To tackle these issues, we propose a general adversarial training framework, named Stabilized Conflict-optimization Adversarial Training (SCAT), integrating adversarial data augmentation into self-supervised MDE methods to achieve a balance between stability and generalization. Specifically, we devise an effective scaling depth network that tunes the coefficients of long skip connection and effectively stabilizes the training process. Then, we propose a conflict gradient surgery strategy, which progressively integrates the adversarial gradient and optimizes the model toward a conflict-free direction. Extensive experiments on five benchmarks demonstrate that SCAT can achieve state-of-the-art performance and significantly improve the generalization capability of existing self-supervised MDE methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02149v2-abstract-full').style.display = 'none'; document.getElementById('2411.02149v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00053">arXiv:2411.00053</a> <span> [<a href="https://arxiv.org/pdf/2411.00053">pdf</a>, <a href="https://arxiv.org/format/2411.00053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ACC-Debate: An Actor-Critic Approach to Multi-Agent Debate </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Estornell%2C+A">Andrew Estornell</a>, <a href="/search/cs?searchtype=author&query=Ton%2C+J">Jean-Francois Ton</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuanshun Yao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00053v2-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated a remarkable ability to serve as general-purpose tools for various language-based tasks. Recent works have demonstrated that the efficacy of such models can be improved through iterative dialog between multiple models, frequently referred to as multi-agent debate (MAD). 
While debate shows promise as a means of improving model efficacy, most works in t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00053v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00053v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00053v2-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated a remarkable ability to serve as general-purpose tools for various language-based tasks. Recent works have demonstrated that the efficacy of such models can be improved through iterative dialog between multiple models, frequently referred to as multi-agent debate (MAD). While debate shows promise as a means of improving model efficacy, most works in this area treat debate as an emergent behavior, rather than a learned behavior. In doing so, current debate frameworks rely on collaborative behaviors to have been sufficiently trained into off-the-shelf models. To address this limitation, we propose ACC-Debate, an Actor-Critic based learning framework to produce a two-agent team specialized in debate. We demonstrate that ACC-Debate outperforms SotA debate techniques on a wide array of benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00053v2-abstract-full').style.display = 'none'; document.getElementById('2411.00053v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22551">arXiv:2410.22551</a> <span> [<a href="https://arxiv.org/pdf/2410.22551">pdf</a>, <a href="https://arxiv.org/format/2410.22551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FairSkin: Fair Diffusion for Skin Disease Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuguang Yao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiming Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingtong Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sijia Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22551v2-abstract-short" style="display: inline;"> Image generation is a prevailing technique for clinical data augmentation for advancing diagnostic accuracy and reducing healthcare disparities. 
Diffusion Model (DM) has become a leading method in generating synthetic medical images, but it suffers from a critical twofold bias: (1) The quality of images generated for Caucasian individuals is significantly higher, as measured by the Frechet Incepti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22551v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22551v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22551v2-abstract-full" style="display: none;"> Image generation is a prevailing technique for clinical data augmentation for advancing diagnostic accuracy and reducing healthcare disparities. Diffusion Model (DM) has become a leading method in generating synthetic medical images, but it suffers from a critical twofold bias: (1) The quality of images generated for Caucasian individuals is significantly higher, as measured by the Frechet Inception Distance (FID). (2) The ability of the downstream-task learner to learn critical features from disease images varies across different skin tones. These biases pose significant risks, particularly in skin disease detection, where underrepresentation of certain skin tones can lead to misdiagnosis or neglect of specific conditions. To address these challenges, we propose FairSkin, a novel DM framework that mitigates these biases through a three-level resampling mechanism, ensuring fairer representation across racial and disease categories. Our approach significantly improves the diversity and quality of generated images, contributing to more equitable skin disease detection in clinical settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22551v2-abstract-full').style.display = 'none'; document.getElementById('2410.22551v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20957">arXiv:2410.20957</a> <span> [<a href="https://arxiv.org/pdf/2410.20957">pdf</a>, <a href="https://arxiv.org/format/2410.20957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Neuro-symbolic Learning Yielding Logical Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yunpeng Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Taolue Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaoxing Ma</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20957v1-abstract-short" style="display: inline;"> Neuro-symbolic systems combine the abilities of neural perception and logical reasoning. However, end-to-end learning of neuro-symbolic systems is still an unsolved challenge. This paper proposes a natural framework that fuses neural network training, symbol grounding, and logical constraint synthesis into a coherent and efficient end-to-end learning process. The capability of this framework comes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20957v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20957v1-abstract-full" style="display: none;"> Neuro-symbolic systems combine the abilities of neural perception and logical reasoning. However, end-to-end learning of neuro-symbolic systems is still an unsolved challenge. This paper proposes a natural framework that fuses neural network training, symbol grounding, and logical constraint synthesis into a coherent and efficient end-to-end learning process. The capability of this framework comes from the improved interactions between the neural and the symbolic parts of the system in both the training and inference stages. Technically, to bridge the gap between the continuous neural network and the discrete logical constraint, we introduce a difference-of-convex programming technique to relax the logical constraints while maintaining their precision. We also employ cardinality constraints as the language for logical constraint learning and incorporate a trust region method to avoid the degeneracy of logical constraint in learning. Both theoretical analyses and empirical evaluations substantiate the effectiveness of the proposed framework. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20957v1-abstract-full').style.display = 'none'; document.getElementById('2410.20957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at NeurIPS 2023, and code is available at [this url](https://github.com/Lizn-zn/Nesy-Programming)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20164">arXiv:2410.20164</a> <span> [<a href="https://arxiv.org/pdf/2410.20164">pdf</a>, <a href="https://arxiv.org/format/2410.20164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prompt Diffusion Robustifies Any-Modality Prompt Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yingjun Du</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Gaowen Liu</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+Y">Yuzhang Shang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuguang Yao</a>, <a href="/search/cs?searchtype=author&query=Kompella%2C+R">Ramana Kompella</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20164v1-abstract-short" style="display: inline;"> Foundation models enable prompt-based classifiers for zero-shot and few-shot learning. Nonetheless, the conventional method of employing fixed prompts suffers from distributional shifts that negatively impact generalizability to unseen samples. This paper introduces prompt diffusion, which uses a diffusion model to gradually refine the prompts to obtain a customized prompt for each sample. Specifi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20164v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20164v1-abstract-full" style="display: none;"> Foundation models enable prompt-based classifiers for zero-shot and few-shot learning. Nonetheless, the conventional method of employing fixed prompts suffers from distributional shifts that negatively impact generalizability to unseen samples. This paper introduces prompt diffusion, which uses a diffusion model to gradually refine the prompts to obtain a customized prompt for each sample. Specifically, we first optimize a collection of prompts to obtain over-fitted prompts per sample. 
Then, we propose a prompt diffusion model within the prompt space, enabling the training of a generative transition process from a random prompt to its overfitted prompt. As we cannot access the label of a test image during inference, our model gradually generates customized prompts solely from random prompts using our trained prompt diffusion. Our prompt diffusion is generic, flexible, and modality-agnostic, making it a simple plug-and-play module seamlessly embedded into existing prompt learning methods for textual, visual, or multi-modal prompt learning. Our diffusion model uses a fast ODE-based sampling strategy to optimize test sample prompts in just five steps, offering a good trade-off between performance improvement and computational efficiency. For all prompt learning methods tested, adding prompt diffusion yields more robust results for base-to-new generalization, cross-dataset generalization, and domain generalization in classification tasks tested over 15 diverse datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20164v1-abstract-full').style.display = 'none'; document.getElementById('2410.20164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17922">arXiv:2410.17922</a> <span> [<a href="https://arxiv.org/pdf/2410.17922">pdf</a>, <a href="https://arxiv.org/format/2410.17922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Guide for Defense (G4D): Dynamic Guidance for Robust and Balanced Defense in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+H">He Cao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weidi Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zijing Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bing Feng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17922v1-abstract-short" style="display: inline;"> With the extensive deployment of Large Language Models (LLMs), ensuring their safety has become increasingly critical. However, existing defense methods often struggle with two key issues: (i) inadequate defense capabilities, particularly in domain-specific scenarios like chemistry, where a lack of specialized knowledge can lead to the generation of harmful responses to malicious queries. 
(ii) ove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17922v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17922v1-abstract-full" style="display: none;"> With the extensive deployment of Large Language Models (LLMs), ensuring their safety has become increasingly critical. However, existing defense methods often struggle with two key issues: (i) inadequate defense capabilities, particularly in domain-specific scenarios like chemistry, where a lack of specialized knowledge can lead to the generation of harmful responses to malicious queries. (ii) over-defensiveness, which compromises the general utility and responsiveness of LLMs. To mitigate these issues, we introduce a multi-agents-based defense framework, Guide for Defense (G4D), which leverages accurate external information to provide an unbiased summary of user intentions and analytically grounded safety response guidance. Extensive experiments on popular jailbreak attacks and benign datasets show that our G4D can enhance LLM's robustness against jailbreak attacks on general and domain-specific scenarios without compromising the model's general functionality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17922v1-abstract-full').style.display = 'none'; document.getElementById('2410.17922v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16257">arXiv:2410.16257</a> <span> [<a href="https://arxiv.org/pdf/2410.16257">pdf</a>, <a href="https://arxiv.org/format/2410.16257">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Elucidating the design space of language models for image generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuantong Liu</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+S">Shaozhe Hao</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xianbiao Qi</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tianyang Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+R">Rong Xiao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16257v1-abstract-short" style="display: inline;"> The success of autoregressive (AR) language models in text generation has inspired the computer vision community to adopt Large Language Models (LLMs) for image generation. However, considering the essential differences between text and image modalities, the design space of language models for image generation remains underexplored. 
We observe that image tokens exhibit greater randomness compared… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16257v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16257v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16257v1-abstract-full" style="display: none;"> The success of autoregressive (AR) language models in text generation has inspired the computer vision community to adopt Large Language Models (LLMs) for image generation. However, considering the essential differences between text and image modalities, the design space of language models for image generation remains underexplored. We observe that image tokens exhibit greater randomness compared to text tokens, which presents challenges when training with token prediction. Nevertheless, AR models demonstrate their potential by effectively learning patterns even from a seemingly suboptimal optimization problem. Our analysis also reveals that while all models successfully grasp the importance of local information in image generation, smaller models struggle to capture the global context. In contrast, larger models showcase improved capabilities in this area, helping to explain the performance gains achieved when scaling up model size. We further elucidate the design space of language models for vision generation, including tokenizer choice, model choice, model scalability, vocabulary design, and sampling strategy through extensive comparative experiments. Our work is the first to analyze the optimization behavior of language models in vision generation, and we believe it can inspire more effective designs when applying LMs to other domains. Finally, our elucidated language model for image generation, termed as ELM, achieves state-of-the-art performance on the ImageNet 256*256 benchmark. The code is available at https://github.com/Pepperlll/LMforImageGeneration.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16257v1-abstract-full').style.display = 'none'; document.getElementById('2410.16257v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://pepper-lll.github.io/LMforImageGeneration/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14843">arXiv:2410.14843</a> <span> [<a href="https://arxiv.org/pdf/2410.14843">pdf</a>, <a href="https://arxiv.org/format/2410.14843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Predictive variational inference: Learn the predictively optimal posterior distribution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+J">Jinlin Lai</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuling Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14843v1-abstract-short" style="display: inline;"> Vanilla variational inference finds an optimal approximation to the Bayesian posterior distribution, but even the exact Bayesian posterior is often not meaningful under model misspecification. We propose predictive variational inference (PVI): a general inference framework that seeks and samples from an optimal posterior density such that the resulting posterior predictive distribution is as close… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14843v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14843v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14843v1-abstract-full" style="display: none;"> Vanilla variational inference finds an optimal approximation to the Bayesian posterior distribution, but even the exact Bayesian posterior is often not meaningful under model misspecification. We propose predictive variational inference (PVI): a general inference framework that seeks and samples from an optimal posterior density such that the resulting posterior predictive distribution is as close to the true data generating process as possible, where this closeness is measured by multiple scoring rules. By optimizing the objective, the predictive variational inference is generally not the same as, nor does it even attempt to approximate, the Bayesian posterior, even asymptotically. Rather, we interpret it as an implicit hierarchical expansion. Further, the learned posterior uncertainty detects heterogeneity of parameters among the population, enabling automatic model diagnosis. This framework applies to both likelihood-exact and likelihood-free models. We demonstrate its application in real data examples. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14843v1-abstract-full').style.display = 'none'; document.getElementById('2410.14843v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14716">arXiv:2410.14716</a> <span> [<a href="https://arxiv.org/pdf/2410.14716">pdf</a>, <a href="https://arxiv.org/format/2410.14716">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Systematic Survey on Large Language Models for Algorithm Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fei Liu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yiming Yao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Ping Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhiyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xi Lin</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+X">Xialiang Tong</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+M">Mingxuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhichao Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenkun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingfu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14716v3-abstract-short" style="display: inline;"> Algorithm Design (AD) is crucial for effective problem-solving across various domains. The advent of Large Language Models (LLMs) has notably enhanced the automation and innovation within this field, offering new perspectives and promising solutions. Over the past three years, the integration of LLMs into AD (LLM4AD) has seen substantial progress, with applications spanning optimization, machine l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14716v3-abstract-full').style.display = 'inline'; document.getElementById('2410.14716v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14716v3-abstract-full" style="display: none;"> Algorithm Design (AD) is crucial for effective problem-solving across various domains. The advent of Large Language Models (LLMs) has notably enhanced the automation and innovation within this field, offering new perspectives and promising solutions. Over the past three years, the integration of LLMs into AD (LLM4AD) has seen substantial progress, with applications spanning optimization, machine learning, mathematical reasoning, and scientific discovery. 
Given the rapid advancements and expanding scope of this field, a systematic review is both timely and necessary. This paper provides a systematic review of LLM4AD. First, we offer an overview and summary of existing studies. Then, we introduce a taxonomy and review the literature across four dimensions: the roles of LLMs, search methods, prompt methods, and application domains with a discussion of potential and achievements of LLMs in AD. Finally, we identify current challenges and highlight several promising directions for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14716v3-abstract-full').style.display = 'none'; document.getElementById('2410.14716v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13387">arXiv:2410.13387</a> <span> [<a href="https://arxiv.org/pdf/2410.13387">pdf</a>, <a href="https://arxiv.org/format/2410.13387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> CLEAR: Towards Contextual LLM-Empowered Privacy Policy Analysis and Risk Generation for Large Language Model Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chaoran Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Daodao Zhou</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yanfang Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T+J">Toby Jia-jun Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yaxing Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13387v2-abstract-short" style="display: inline;"> The rise of end-user applications powered by large language models (LLMs), including both conversational interfaces and add-ons to existing graphical user interfaces (GUIs), introduces new privacy challenges. However, many users remain unaware of the risks. This paper explores methods to increase user awareness of privacy risks associated with LLMs in end-user applications. We conducted five co-de… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13387v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13387v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13387v2-abstract-full" style="display: none;"> The rise of end-user applications powered by large language models (LLMs), including both conversational interfaces and add-ons to existing graphical user interfaces (GUIs), introduces new privacy challenges. However, many users remain unaware of the risks. This paper explores methods to increase user awareness of privacy risks associated with LLMs in end-user applications. 
We conducted five co-design workshops to uncover user privacy concerns and their demand for contextual privacy information within LLMs. Based on these insights, we developed CLEAR (Contextual LLM-Empowered Privacy Policy Analysis and Risk Generation), a just-in-time contextual assistant designed to help users identify sensitive information, summarize relevant privacy policies, and highlight potential risks when sharing information with LLMs. We evaluated the usability and usefulness of CLEAR in two example domains: ChatGPT and the Gemini plugin in Gmail. Our findings demonstrated that CLEAR is easy to use and improves user understanding of data practices and privacy risks. We also discussed LLM's duality in posing and mitigating privacy risks, offering design and policy implications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13387v2-abstract-full').style.display = 'none'; document.getElementById('2410.13387v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13195">arXiv:2410.13195</a> <span> [<a href="https://arxiv.org/pdf/2410.13195">pdf</a>, <a href="https://arxiv.org/format/2410.13195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniG: Modelling Unitary 3D Gaussians for View-consistent 3D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiamin Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kenkun Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yukai Shi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaoke Jiang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13195v2-abstract-short" style="display: inline;"> In this work, we present UniG, a view-consistent 3D reconstruction and novel view synthesis model that generates a high-fidelity representation of 3D Gaussians from sparse images. Existing 3D Gaussians-based methods usually regress Gaussians per-pixel of each view, create 3D Gaussians per view separately, and merge them through point concatenation. 
Such a view-independent reconstruction approach o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13195v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13195v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13195v2-abstract-full" style="display: none;"> In this work, we present UniG, a view-consistent 3D reconstruction and novel view synthesis model that generates a high-fidelity representation of 3D Gaussians from sparse images. Existing 3D Gaussians-based methods usually regress Gaussians per-pixel of each view, create 3D Gaussians per view separately, and merge them through point concatenation. Such a view-independent reconstruction approach often results in a view inconsistency issue, where the predicted positions of the same 3D point from different views may have discrepancies. To address this problem, we develop a DETR (DEtection TRansformer)-like framework, which treats 3D Gaussians as decoder queries and updates their parameters layer by layer by performing multi-view cross-attention (MVDFA) over multiple input images. In this way, multiple views naturally contribute to modeling a unitary representation of 3D Gaussians, thereby making 3D reconstruction more view-consistent. Moreover, as the number of 3D Gaussians used as decoder queries is irrespective of the number of input views, our model allows an arbitrary number of input images without causing memory explosion. Extensive experiments validate the advantages of our approach, showcasing superior performance over existing methods quantitatively (improving PSNR by 4.2 dB when trained on Objaverse and tested on the GSO benchmark) and qualitatively. The code will be released at https://github.com/jwubz123/UNIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13195v2-abstract-full').style.display = 'none'; document.getElementById('2410.13195v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12613">arXiv:2410.12613</a> <span> [<a href="https://arxiv.org/pdf/2410.12613">pdf</a>, <a href="https://arxiv.org/format/2410.12613">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Exploring Model Kinship for Merging Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yedi Hu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yunzhi Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Shumin Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huajun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12613v1-abstract-short" style="display: inline;"> Model merging has become one of the key technologies for enhancing the capabilities and efficiency of Large Language Models (LLMs). However, our understanding of the expected performance gains and principles when merging any two models remains limited. In this work, we introduce model kinship, the degree of similarity or relatedness between LLMs, analogous to biological evolution. With comprehensi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12613v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12613v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12613v1-abstract-full" style="display: none;"> Model merging has become one of the key technologies for enhancing the capabilities and efficiency of Large Language Models (LLMs). However, our understanding of the expected performance gains and principles when merging any two models remains limited. In this work, we introduce model kinship, the degree of similarity or relatedness between LLMs, analogous to biological evolution. With comprehensive empirical analysis, we find that there is a certain relationship between model kinship and the performance gains after model merging, which can help guide our selection of candidate models. Inspired by this, we propose a new model merging strategy: Top-k Greedy Merging with Model Kinship, which can yield better performance on benchmark datasets. Specifically, we discover that using model kinship as a criterion can assist us in continuously performing model merging, alleviating the degradation (local optima) in model evolution, whereas model kinship can serve as a guide to escape these traps. Code is available at https://github.com/zjunlp/ModelKinship. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12613v1-abstract-full').style.display = 'none'; document.getElementById('2410.12613v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Ongoing work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11410">arXiv:2410.11410</a> <span> [<a href="https://arxiv.org/pdf/2410.11410">pdf</a>, <a href="https://arxiv.org/format/2410.11410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PMMT: Preference Alignment in Multilingual Machine Translation via LLM Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shuqiao Sun</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yutong Yao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Peiwen Wu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+F">Feijun Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11410v1-abstract-short" style="display: inline;"> Translation is important for cross-language communication, and many efforts have been made to improve its accuracy. However, less investment is conducted in aligning translations with human preferences, such as translation tones or styles. In this paper, a new method is proposed to effectively generate large-scale multilingual parallel corpora with specific translation preferences using Large Lang… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11410v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11410v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11410v1-abstract-full" style="display: none;"> Translation is important for cross-language communication, and many efforts have been made to improve its accuracy. However, less investment is conducted in aligning translations with human preferences, such as translation tones or styles. In this paper, a new method is proposed to effectively generate large-scale multilingual parallel corpora with specific translation preferences using Large Language Models (LLMs). Meanwhile, an automatic pipeline is designed to distill human preferences into smaller Machine Translation (MT) models for efficiently and economically supporting large-scale calls in online services. Experiments indicate that the proposed method takes the lead in translation tasks with aligned human preferences by a large margin. 
Meanwhile, on popular public benchmarks like WMT and Flores, on which our models were not trained, the proposed method also shows a competitive performance compared to SOTA works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11410v1-abstract-full').style.display = 'none'; document.getElementById('2410.11410v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10770">arXiv:2410.10770</a> <span> [<a href="https://arxiv.org/pdf/2410.10770">pdf</a>, <a href="https://arxiv.org/ps/2410.10770">ps</a>, <a href="https://arxiv.org/format/2410.10770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Exponents for classical-quantum channel simulation in purified distance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oufkir%2C+A">Aadil Oufkir</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yongsheng Yao</a>, <a href="/search/cs?searchtype=author&query=Berta%2C+M">Mario Berta</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10770v1-abstract-short" style="display: inline;"> We determine the exact error and strong converse exponent for entanglement-assisted classical-quantum channel simulation in worst case input purified distance. The error exponent is expressed as a single-letter formula optimized over sandwiched Rényi divergences of order $α\in [1, \infty)$, notably without the need for a critical rate--a sharp contrast to the error exponent for classical-quantum c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10770v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10770v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10770v1-abstract-full" style="display: none;"> We determine the exact error and strong converse exponent for entanglement-assisted classical-quantum channel simulation in worst case input purified distance. The error exponent is expressed as a single-letter formula optimized over sandwiched Rényi divergences of order $α\in [1, \infty)$, notably without the need for a critical rate--a sharp contrast to the error exponent for classical-quantum channel coding. The strong converse exponent is expressed as a single-letter formula optimized over sandwiched Rényi divergences of order $α\in [\frac{1}{2},1]$. As in the classical work [Oufkir et al., arXiv:2410.07051], we start with the goal of asymptotically expanding the meta-converse for channel simulation in the relevant regimes. 
However, to deal with non-commutativity issues arising from classical-quantum channels and entanglement-assistance, we critically use various properties of the quantum fidelity, additional auxiliary channel techniques, approximations via Chebyshev inequalities, and entropic continuity bounds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10770v1-abstract-full').style.display = 'none'; document.getElementById('2410.10770v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27+5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07718">arXiv:2410.07718</a> <span> [<a href="https://arxiv.org/pdf/2410.07718">pdf</a>, <a href="https://arxiv.org/format/2410.07718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hallo2: Long-Duration and High-Resolution Audio-Driven Portrait Image Animation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+J">Jiahao Cui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hui Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hao Zhu</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+H">Hanlin Shang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+K">Kaihui Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Siyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07718v2-abstract-short" style="display: inline;"> Recent advances in latent diffusion-based generative models for portrait image animation, such as Hallo, have achieved impressive results in short-duration video synthesis. In this paper, we present updates to Hallo, introducing several design enhancements to extend its capabilities. First, we extend the method to produce long-duration videos. To address substantial challenges such as appearance d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07718v2-abstract-full').style.display = 'inline'; document.getElementById('2410.07718v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07718v2-abstract-full" style="display: none;"> Recent advances in latent diffusion-based generative models for portrait image animation, such as Hallo, have achieved impressive results in short-duration video synthesis. In this paper, we present updates to Hallo, introducing several design enhancements to extend its capabilities. First, we extend the method to produce long-duration videos. 
To address substantial challenges such as appearance drift and temporal artifacts, we investigate augmentation strategies within the image space of conditional motion frames. Specifically, we introduce a patch-drop technique augmented with Gaussian noise to enhance visual consistency and temporal coherence over long duration. Second, we achieve 4K resolution portrait video generation. To accomplish this, we implement vector quantization of latent codes and apply temporal alignment techniques to maintain coherence across the temporal dimension. By integrating a high-quality decoder, we realize visual synthesis at 4K resolution. Third, we incorporate adjustable semantic textual labels for portrait expressions as conditional inputs. This extends beyond traditional audio cues to improve controllability and increase the diversity of the generated content. To the best of our knowledge, Hallo2, proposed in this paper, is the first method to achieve 4K resolution and generate hour-long, audio-driven portrait image animations enhanced with textual prompts. We have conducted extensive experiments to evaluate our method on publicly available datasets, including HDTF, CelebV, and our introduced "Wild" dataset. The experimental results demonstrate that our approach achieves state-of-the-art performance in long-duration portrait video animation, successfully generating rich and controllable content at 4K resolution for duration extending up to tens of minutes. Project page https://fudan-generative-vision.github.io/hallo2 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07718v2-abstract-full').style.display = 'none'; document.getElementById('2410.07718v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06340">arXiv:2410.06340</a> <span> [<a href="https://arxiv.org/pdf/2410.06340">pdf</a>, <a href="https://arxiv.org/format/2410.06340">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedGraph: A Research Library and Benchmark for Federated Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuhang Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+X">Xinyi Fan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junhao Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kay Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+W">Weizhao Jin</a>, <a href="/search/cs?searchtype=author&query=Ravi%2C+S">Srivatsan Ravi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. 
Yu</a>, <a href="/search/cs?searchtype=author&query=Joe-Wong%2C+C">Carlee Joe-Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06340v2-abstract-short" style="display: inline;"> Federated graph learning is an emerging field with significant practical challenges. While many algorithms have been proposed to enhance the accuracy of training graph neural networks, e.g., for node classification problems on large graphs, in a federated manner, their system performance is often overlooked, even though it is crucial for real-world deployment. To address this gap, we introduce Fed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06340v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06340v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06340v2-abstract-full" style="display: none;"> Federated graph learning is an emerging field with significant practical challenges. While many algorithms have been proposed to enhance the accuracy of training graph neural networks, e.g., for node classification problems on large graphs, in a federated manner, their system performance is often overlooked, even though it is crucial for real-world deployment. To address this gap, we introduce FedGraph, a research library built for practical distributed deployment and benchmarking in federated graph learning. FedGraph supports a range of state-of-the-art graph learning methods and includes built-in profiling tools to evaluate system performance, focusing specifically on communication and computation costs during training. Unlike existing benchmark platforms, FedGraph natively incorporates homomorphic encryption to enhance privacy preservation and facilitates the development of practical applications by enabling distributed training across multiple physical machines, providing an evaluation framework that can guide the system design of future federated graph learning algorithms. Leveraging these optimizations, we use FedGraph to demonstrate the first privacy-preserving federated learning system to run on graphs with 100 million nodes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06340v2-abstract-full').style.display = 'none'; document.getElementById('2410.06340v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
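<p class="is-size-7">As a rough illustration of the communication-cost profiling idea mentioned in the FedGraph abstract, the generic Python sketch below counts the bytes uploaded by each client during one federated-averaging round. It is not the FedGraph library's API; names such as <code>fed_avg_round</code> and <code>model_size_bytes</code> are hypothetical and shown only to make the profiling idea concrete.</p>
<pre><code class="language-python"># Generic sketch of one federated averaging round with per-round
# communication accounting; NOT the FedGraph API, just an illustration.
import numpy as np

def model_size_bytes(params):
    # total bytes a client would upload for its list of parameter arrays
    return sum(p.nbytes for p in params)

def fed_avg_round(client_params, client_weights):
    """Weighted average of per-client parameter lists (one list per client)."""
    total = float(sum(client_weights))
    averaged = []
    for layer_idx in range(len(client_params[0])):
        stacked = np.stack([c[layer_idx] * (w / total)
                            for c, w in zip(client_params, client_weights)])
        averaged.append(stacked.sum(axis=0))
    return averaged

# hypothetical clients, each holding two layers of parameters
clients = [[np.random.randn(64, 16), np.random.randn(16)] for _ in range(4)]
weights = [100, 250, 80, 120]   # e.g. number of local graph nodes per client

uploaded = sum(model_size_bytes(c) for c in clients)   # client uploads per round
global_params = fed_avg_round(clients, weights)
print(f"per-round upload: {uploaded / 1e6:.3f} MB across {len(clients)} clients")
</code></pre>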
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/FedGraph/fedgraph</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04917">arXiv:2410.04917</a> <span> [<a href="https://arxiv.org/pdf/2410.04917">pdf</a>, <a href="https://arxiv.org/format/2410.04917">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Why am I seeing this: Democratizing End User Auditing for Online Content Recommendations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chaoran Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Leyang Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+L">Luke Cao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yanfang Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianshi Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yaxing Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T+J">Toby Jia-jun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04917v1-abstract-short" style="display: inline;"> Personalized recommendation systems tailor content based on user attributes, which are either provided or inferred from private data. Research suggests that users often hypothesize about reasons behind contents they encounter (e.g., "I see this jewelry ad because I am a woman"), but they lack the means to confirm these hypotheses due to the opaqueness of these systems. This hinders informed decisi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04917v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04917v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04917v1-abstract-full" style="display: none;"> Personalized recommendation systems tailor content based on user attributes, which are either provided or inferred from private data. Research suggests that users often hypothesize about reasons behind contents they encounter (e.g., "I see this jewelry ad because I am a woman"), but they lack the means to confirm these hypotheses due to the opaqueness of these systems. This hinders informed decision-making about privacy and system use and contributes to the lack of algorithmic accountability. To address these challenges, we introduce a new interactive sandbox approach. This approach creates sets of synthetic user personas and corresponding personal data that embody realistic variations in personal attributes, allowing users to test their hypotheses by observing how a website's algorithms respond to these personas. We tested the sandbox in the context of targeted advertisement. Our user study demonstrates its usability, usefulness, and effectiveness in empowering end-user auditing in a case study of targeting ads. 
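<p class="is-size-7">The counterfactual-audit logic described in this abstract, varying one persona attribute while holding the rest fixed and comparing what each persona is shown, can be sketched generically as below. The snippet is only an illustration under assumed names: <code>fetch_served_ads</code> is a placeholder for whatever sandboxed browsing session the system would actually drive, not a real API.</p>
<pre><code class="language-python"># Generic sketch of persona-based counterfactual auditing; placeholder names.
from collections import Counter
from copy import deepcopy

def fetch_served_ads(persona):
    # placeholder: a real sandbox would drive a browser session seeded with
    # the persona's synthetic profile and log the ads or content shown
    raise NotImplementedError

def audit_attribute(base_persona, attribute, values, n_visits=50):
    results = {}
    for value in values:
        persona = deepcopy(base_persona)
        persona[attribute] = value          # vary exactly one attribute
        counts = Counter()
        for _ in range(n_visits):
            counts.update(fetch_served_ads(persona))
        results[value] = counts
    return results   # e.g. {"woman": Counter({"jewelry": 31, ...}), ...}

base = {"age": 30, "gender": "woman", "interests": ["hiking"]}
# report = audit_attribute(base, "gender", ["woman", "man"])
</code></pre>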
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04917v1-abstract-full').style.display = 'none'; document.getElementById('2410.04917v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03955">arXiv:2410.03955</a> <span> [<a href="https://arxiv.org/pdf/2410.03955">pdf</a>, <a href="https://arxiv.org/format/2410.03955">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Model Developmental Safety: A Safety-Centric Method and Applications in Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Gang Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wendi Yu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+W">Wei Tong</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingbin Liang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qihang Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianbao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03955v2-abstract-short" style="display: inline;"> In the real world, a learning-enabled system usually undergoes multiple cycles of model development to enhance the system's ability to handle difficult or emerging tasks. This continual model development process raises a significant issue that the model development for acquiring new or improving existing capabilities may inadvertently lose capabilities of the old model, also known as catastrophic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03955v2-abstract-full').style.display = 'inline'; document.getElementById('2410.03955v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03955v2-abstract-full" style="display: none;"> In the real world, a learning-enabled system usually undergoes multiple cycles of model development to enhance the system's ability to handle difficult or emerging tasks. This continual model development process raises a significant issue that the model development for acquiring new or improving existing capabilities may inadvertently lose capabilities of the old model, also known as catastrophic forgetting. Existing continual learning studies focus on mitigating catastrophic forgetting by trading off performance on previous tasks and new tasks to ensure good average performance. 
However, they are inadequate for many applications especially in safety-critical domains, as failure to strictly preserve the performance of the old model not only introduces safety risks and uncertainties but also imposes substantial expenses in the re-improving and re-validation of existing properties. To address this issue, we introduce model developmental safety as a guarantee of a learning system such that in the model development process the new model should strictly preserve the existing protected capabilities of the old model while improving its performance on target tasks. To ensure the model developmental safety, we present a safety-centric framework by formulating the model developmental safety as data-dependent constraints. Under this framework, we study how to develop a pretrained vision-language model (aka the CLIP model) for acquiring new capabilities or improving existing capabilities of image classification. We propose an efficient constrained optimization algorithm with theoretical guarantee and use its insights to finetune a CLIP model with task-dependent heads for promoting the model developmental safety. Our experiments on improving vision perception capabilities on autonomous driving and scene recognition datasets demonstrate the efficacy of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03955v2-abstract-full').style.display = 'none'; document.getElementById('2410.03955v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
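<p class="is-size-7">To make the "data-dependent constraints" idea above concrete, the following is a minimal sketch of a penalty-style training step: it improves the target-task loss while penalising any increase of the loss on protected (old-capability) data relative to the frozen old model. This is a simplified stand-in for the paper's constrained optimization algorithm, and <code>lambda_penalty</code> and the batch layout are assumptions.</p>
<pre><code class="language-python"># Hedged sketch of constrained capability-preserving finetuning (penalty form).
import torch

def developmental_safety_step(new_model, old_model, optimizer,
                              target_batch, protected_batch,
                              loss_fn, lambda_penalty=10.0):
    optimizer.zero_grad()
    target_loss = loss_fn(new_model(target_batch["x"]), target_batch["y"])

    with torch.no_grad():
        old_protected = loss_fn(old_model(protected_batch["x"]),
                                protected_batch["y"])
    new_protected = loss_fn(new_model(protected_batch["x"]),
                            protected_batch["y"])

    # penalise only the part of the protected loss that exceeds the old model
    violation = torch.clamp(new_protected - old_protected, min=0.0)
    loss = target_loss + lambda_penalty * violation
    loss.backward()
    optimizer.step()
    return target_loss.item(), violation.item()
</code></pre>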
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">40 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03777">arXiv:2410.03777</a> <span> [<a href="https://arxiv.org/pdf/2410.03777">pdf</a>, <a href="https://arxiv.org/format/2410.03777">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Determine-Then-Ensemble: Necessity of Top-k Union for Large Language Model Ensembling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuxuan Yao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Han Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingyang Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Sichun Luo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiongwei Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linqi Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03777v1-abstract-short" style="display: inline;"> Large language models (LLMs) exhibit varying strengths and weaknesses across different tasks, prompting recent studies to explore the benefits of ensembling models to leverage their complementary advantages. However, existing LLM ensembling methods often overlook model compatibility and struggle with inefficient alignment of probabilities across the entire vocabulary. In this study, we empirically… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03777v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03777v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03777v1-abstract-full" style="display: none;"> Large language models (LLMs) exhibit varying strengths and weaknesses across different tasks, prompting recent studies to explore the benefits of ensembling models to leverage their complementary advantages. However, existing LLM ensembling methods often overlook model compatibility and struggle with inefficient alignment of probabilities across the entire vocabulary. In this study, we empirically investigate the factors influencing ensemble performance, identifying model performance, vocabulary size, and response style as key determinants, revealing that compatibility among models is essential for effective ensembling. This analysis leads to the development of a simple yet effective model selection strategy that identifies compatible models. Additionally, we introduce the \textsc{Uni}on \textsc{T}op-$k$ \textsc{E}nsembling (\textsc{UniTE}), a novel approach that efficiently combines models by focusing on the union of the top-k tokens from each model, thereby avoiding the need for full vocabulary alignment and reducing computational overhead. 
Extensive evaluations across multiple benchmarks demonstrate that \textsc{UniTE} significantly enhances performance compared to existing methods, offering a more efficient framework for LLM ensembling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03777v1-abstract-full').style.display = 'none'; document.getElementById('2410.03777v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02644">arXiv:2410.02644</a> <span> [<a href="https://arxiv.org/pdf/2410.02644">pdf</a>, <a href="https://arxiv.org/format/2410.02644">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-based Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanrong Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingyuan Huang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+K">Kai Mei</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yifei Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenting Wang</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+C">Chenlu Zhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongwei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongfeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02644v1-abstract-short" style="display: inline;"> Although LLM-based agents, powered by Large Language Models (LLMs), can use external tools and memory mechanisms to solve complex real-world tasks, they may also introduce critical security vulnerabilities. However, the existing literature does not comprehensively evaluate attacks and defenses against LLM-based agents. To address this, we introduce Agent Security Bench (ASB), a comprehensive frame… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02644v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02644v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02644v1-abstract-full" style="display: none;"> Although LLM-based agents, powered by Large Language Models (LLMs), can use external tools and memory mechanisms to solve complex real-world tasks, they may also introduce critical security vulnerabilities. However, the existing literature does not comprehensively evaluate attacks and defenses against LLM-based agents. 
To address this, we introduce Agent Security Bench (ASB), a comprehensive framework designed to formalize, benchmark, and evaluate the attacks and defenses of LLM-based agents, including 10 scenarios (e.g., e-commerce, autonomous driving, finance), 10 agents targeting the scenarios, over 400 tools, 23 different types of attack/defense methods, and 8 evaluation metrics. Based on ASB, we benchmark 10 prompt injection attacks, a memory poisoning attack, a novel Plan-of-Thought backdoor attack, a mixed attack, and 10 corresponding defenses across 13 LLM backbones with nearly 90,000 testing cases in total. Our benchmark results reveal critical vulnerabilities in different stages of agent operation, including system prompt, user prompt handling, tool usage, and memory retrieval, with the highest average attack success rate of 84.30\%, but limited effectiveness shown in current defenses, unveiling important works to be done in terms of agent security for the community. Our code can be found at https://github.com/agiresearch/ASB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02644v1-abstract-full').style.display = 'none'; document.getElementById('2410.02644v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01226">arXiv:2410.01226</a> <span> [<a href="https://arxiv.org/pdf/2410.01226">pdf</a>, <a href="https://arxiv.org/format/2410.01226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Native Generative Model for 3D Head Avatar </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yiyu Zhuang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiao He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanwen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiahe Zhu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Siyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xun Cao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01226v1-abstract-short" style="display: inline;"> Creating 3D head avatars is a significant yet challenging task for many applicated scenarios. Previous studies have set out to learn 3D human head generative models using massive 2D image data. Although these models are highly generalizable for human appearance, their result models are not 360$^\circ$-renderable, and the predicted 3D geometry is unreliable. 
Therefore, such results cannot be used i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01226v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01226v1-abstract-full" style="display: none;"> Creating 3D head avatars is a significant yet challenging task for many applicated scenarios. Previous studies have set out to learn 3D human head generative models using massive 2D image data. Although these models are highly generalizable for human appearance, their result models are not 360$^\circ$-renderable, and the predicted 3D geometry is unreliable. Therefore, such results cannot be used in VR, game modeling, and other scenarios that require 360$^\circ$-renderable 3D head models. An intuitive idea is that 3D head models with limited amount but high 3D accuracy are more reliable training data for a high-quality 3D generative model. In this vein, we delve into how to learn a native generative model for 360$^\circ$ full head from a limited 3D head dataset. Specifically, three major problems are studied: 1) how to effectively utilize various representations for generating the 360$^\circ$-renderable human head; 2) how to disentangle the appearance, shape, and motion of human faces to generate a 3D head model that can be edited by appearance and driven by motion; 3) and how to extend the generalization capability of the generative model to support downstream tasks. Comprehensive experiments are conducted to verify the effectiveness of the proposed model. We hope the proposed models and artist-designed dataset can inspire future research on learning native generative 3D head models from limited 3D datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01226v1-abstract-full').style.display = 'none'; document.getElementById('2410.01226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20181">arXiv:2409.20181</a> <span> [<a href="https://arxiv.org/pdf/2409.20181">pdf</a>, <a href="https://arxiv.org/format/2409.20181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reference Trustable Decoding: A Training-Free Augmentation Paradigm for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+L">Luohe Shi</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zuchao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lefei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hai Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20181v2-abstract-short" style="display: inline;"> Large language models (LLMs) have rapidly advanced and demonstrated impressive capabilities. In-Context Learning (ICL) and Parameter-Efficient Fine-Tuning (PEFT) are currently two mainstream methods for augmenting LLMs to downstream tasks. ICL typically constructs a few-shot learning scenario, either manually or by setting up a Retrieval-Augmented Generation (RAG) system, helping models quickly gr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20181v2-abstract-full').style.display = 'inline'; document.getElementById('2409.20181v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20181v2-abstract-full" style="display: none;"> Large language models (LLMs) have rapidly advanced and demonstrated impressive capabilities. In-Context Learning (ICL) and Parameter-Efficient Fine-Tuning (PEFT) are currently two mainstream methods for augmenting LLMs to downstream tasks. ICL typically constructs a few-shot learning scenario, either manually or by setting up a Retrieval-Augmented Generation (RAG) system, helping models quickly grasp domain knowledge or question-answering patterns without changing model parameters. However, this approach involves trade-offs, such as slower inference speed and increased space occupancy. PEFT assists the model in adapting to tasks through minimal parameter modifications, but the training process still demands high hardware requirements, even with a small number of parameters involved. To address these challenges, we propose Reference Trustable Decoding (RTD), a paradigm that allows models to quickly adapt to new tasks without fine-tuning, maintaining low inference costs. RTD constructs a reference datastore from the provided training examples and optimizes the LLM's final vocabulary distribution by flexibly selecting suitable references based on the input, resulting in more trustable responses and enabling the model to adapt to downstream tasks at a low cost. Experimental evaluations on various LLMs using different benchmarks demonstrate that RTD establishes a new paradigm for augmenting models to downstream tasks. Furthermore, our method exhibits strong orthogonality with traditional methods, allowing for concurrent usage. 
Our code can be found at https://github.com/ShiLuohe/ReferenceTrustableDecoding <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20181v2-abstract-full').style.display = 'none'; document.getElementById('2409.20181v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16160">arXiv:2409.16160</a> <span> [<a href="https://arxiv.org/pdf/2409.16160">pdf</a>, <a href="https://arxiv.org/format/2409.16160">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MIMO: Controllable Character Video Synthesis with Spatial Decomposed Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Men%2C+Y">Yifang Men</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+M">Miaomiao Cui</a>, <a href="/search/cs?searchtype=author&query=Bo%2C+L">Liefeng Bo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16160v1-abstract-short" style="display: inline;"> Character video synthesis aims to produce realistic videos of animatable characters within lifelike scenes. As a fundamental problem in the computer vision and graphics community, 3D works typically require multi-view captures for per-case training, which severely limits their applicability of modeling arbitrary characters in a short time. Recent 2D methods break this limitation via pre-trained di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16160v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16160v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16160v1-abstract-full" style="display: none;"> Character video synthesis aims to produce realistic videos of animatable characters within lifelike scenes. As a fundamental problem in the computer vision and graphics community, 3D works typically require multi-view captures for per-case training, which severely limits their applicability of modeling arbitrary characters in a short time. Recent 2D methods break this limitation via pre-trained diffusion models, but they struggle for pose generality and scene interaction. 
To this end, we propose MIMO, a novel framework which can not only synthesize character videos with controllable attributes (i.e., character, motion and scene) provided by simple user inputs, but also simultaneously achieve advanced scalability to arbitrary characters, generality to novel 3D motions, and applicability to interactive real-world scenes in a unified framework. The core idea is to encode the 2D video to compact spatial codes, considering the inherent 3D nature of video occurrence. Concretely, we lift the 2D frame pixels into 3D using monocular depth estimators, and decompose the video clip to three spatial components (i.e., main human, underlying scene, and floating occlusion) in hierarchical layers based on the 3D depth. These components are further encoded to canonical identity code, structured motion code and full scene code, which are utilized as control signals of synthesis process. The design of spatial decomposed modeling enables flexible user control, complex motion expression, as well as 3D-aware synthesis for scene interactions. Experimental results demonstrate effectiveness and robustness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16160v1-abstract-full').style.display = 'none'; document.getElementById('2409.16160v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://menyifang.github.io/projects/MIMO/index.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16056">arXiv:2409.16056</a> <span> [<a href="https://arxiv.org/pdf/2409.16056">pdf</a>, <a href="https://arxiv.org/format/2409.16056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Watermarking for Face Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuguang Yao</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+A">Anil Jain</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sijia Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16056v1-abstract-short" style="display: inline;"> Watermarking is an essential technique for embedding an identifier (i.e., watermark message) within digital images to assert ownership and monitor unauthorized alterations. In face recognition systems, watermarking plays a pivotal role in ensuring data integrity and security. 
However, an adversary could potentially interfere with the watermarking process, significantly impairing recognition perfor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16056v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16056v1-abstract-full" style="display: none;"> Watermarking is an essential technique for embedding an identifier (i.e., watermark message) within digital images to assert ownership and monitor unauthorized alterations. In face recognition systems, watermarking plays a pivotal role in ensuring data integrity and security. However, an adversary could potentially interfere with the watermarking process, significantly impairing recognition performance. We explore the interaction between watermarking and adversarial attacks on face recognition models. Our findings reveal that while watermarking or input-level perturbation alone may have a negligible effect on recognition accuracy, the combined effect of watermarking and perturbation can result in an adversarial watermarking attack, significantly degrading recognition performance. Specifically, we introduce a novel threat model, the adversarial watermarking attack, which remains stealthy in the absence of watermarking, allowing images to be correctly recognized initially. However, once watermarking is applied, the attack is activated, causing recognition failures. Our study reveals a previously unrecognized vulnerability: adversarial perturbations can exploit the watermark message to evade face recognition systems. Evaluated on the CASIA-WebFace dataset, our proposed adversarial watermarking attack reduces face matching accuracy by 67.2% with an $\ell_\infty$ norm-measured perturbation strength of ${2}/{255}$ and by 95.9% with a strength of ${4}/{255}$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16056v1-abstract-full').style.display = 'none'; document.getElementById('2409.16056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
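<p class="is-size-7">The "benign until watermarked" behaviour described in this abstract can be sketched as a PGD-style optimization: the perturbation keeps the embedding of the perturbed image close to the enrolled template, but pushes the embedding of the watermarked perturbed image away from it. The code below is a generic illustration, not the paper's implementation; <code>embed</code>, <code>watermark</code>, and the step sizes are placeholders.</p>
<pre><code class="language-python"># Hedged PGD-style sketch of an adversarial watermarking attack.
import torch
import torch.nn.functional as F

def adversarial_watermark_attack(x, template, embed, watermark,
                                 eps=2.0 / 255, steps=40, step_size=1.0 / 255):
    delta = torch.zeros_like(x, requires_grad=True)
    for _ in range(steps):
        clean_sim = F.cosine_similarity(embed(x + delta), template, dim=-1)
        marked_sim = F.cosine_similarity(embed(watermark(x + delta)),
                                         template, dim=-1)
        # descend: keep clean similarity high, make watermarked similarity low
        loss = marked_sim.mean() - clean_sim.mean()
        loss.backward()
        with torch.no_grad():
            delta -= step_size * delta.grad.sign()
            delta.clamp_(-eps, eps)                     # ell-infinity budget
            delta.add_(x).clamp_(0.0, 1.0).sub_(x)      # keep x + delta valid
        delta.grad = None
    return delta.detach()
</code></pre>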
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15723">arXiv:2409.15723</a> <span> [<a href="https://arxiv.org/pdf/2409.15723">pdf</a>, <a href="https://arxiv.org/ps/2409.15723">ps</a>, <a href="https://arxiv.org/format/2409.15723">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Federated Large Language Models: Current Progress and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuhang Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chengkai Huang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yu Xia</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Rossi%2C+R">Ryan Rossi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Ang Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+L">Lina Yao</a>, <a href="/search/cs?searchtype=author&query=McAuley%2C+J">Julian McAuley</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiran Chen</a>, <a href="/search/cs?searchtype=author&query=Joe-Wong%2C+C">Carlee Joe-Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15723v1-abstract-short" style="display: inline;"> Large language models are rapidly gaining popularity and have been widely adopted in real-world applications. While the quality of training data is essential, privacy concerns arise during data collection. Federated learning offers a solution by allowing multiple clients to collaboratively train LLMs without sharing local data. However, FL introduces new challenges, such as model convergence issue… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15723v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15723v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15723v1-abstract-full" style="display: none;"> Large language models are rapidly gaining popularity and have been widely adopted in real-world applications. While the quality of training data is essential, privacy concerns arise during data collection. Federated learning offers a solution by allowing multiple clients to collaboratively train LLMs without sharing local data. However, FL introduces new challenges, such as model convergence issues due to heterogeneous data and high communication costs. A comprehensive study is required to address these challenges and guide future research. This paper surveys Federated learning for LLMs (FedLLM), highlighting recent advances and future directions. We focus on two key aspects: fine-tuning and prompt learning in a federated setting, discussing existing work and associated research challenges. 
We finally propose potential research directions for federated LLMs, including pre-training and how LLMs can further enhance federated learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15723v1-abstract-full').style.display = 'none'; document.getElementById('2409.15723v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11844">arXiv:2409.11844</a> <span> [<a href="https://arxiv.org/pdf/2409.11844">pdf</a>, <a href="https://arxiv.org/format/2409.11844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MEOW: MEMOry Supervised LLM Unlearning Via Inverted Facts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+T">Tianle Gu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kexin Huang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruilin Luo</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuanqi Yao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+Y">Yan Teng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingchun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11844v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) can memorize sensitive information, raising concerns about potential misuse. LLM Unlearning, a post-hoc approach to remove this information from trained LLMs, offers a promising solution to mitigate these risks. However, previous practices face three key challenges: 1. Utility: successful unlearning often causes catastrophic collapse on unrelated tasks. 2. Efficiency:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11844v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11844v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11844v1-abstract-full" style="display: none;"> Large Language Models (LLMs) can memorize sensitive information, raising concerns about potential misuse. LLM Unlearning, a post-hoc approach to remove this information from trained LLMs, offers a promising solution to mitigate these risks. However, previous practices face three key challenges: 1. Utility: successful unlearning often causes catastrophic collapse on unrelated tasks. 2. Efficiency: many methods either involve adding similarly sized models, which slows down unlearning or inference, or require retain data that are difficult to obtain. 3. Robustness: even effective methods may still leak data via extraction techniques. To address these challenges, we propose MEOW, a simple yet effective gradient descent-based unlearning method. 
Specifically, we use an offline LLM to generate a set of inverted facts. Then, we design a new metric, MEMO, to quantify memorization in LLMs. Finally, based on the signals provided by MEMO, we select the most appropriate set of inverted facts and finetune the model based on them. We evaluate MEOW on the commonly used unlearn benchmark, ToFU, with Llama2-7B-Chat and Phi-1.5B, and test it on both NLU and NLG tasks. Results demonstrate significant improvement of MEOW in forget quality without substantial loss in model utility. Meanwhile, MEOW does not exhibit significant degradation in NLU or NLG capabilities, and there is even a slight improvement in NLU performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11844v1-abstract-full').style.display = 'none'; document.getElementById('2409.11844v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09366">arXiv:2409.09366</a> <span> [<a href="https://arxiv.org/pdf/2409.09366">pdf</a>, <a href="https://arxiv.org/format/2409.09366">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MHAD: Multimodal Home Activity Dataset with Multi-Angle Videos and Synchronized Physiological Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lei Yu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+J">Jintao Fei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyi Liu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yang Yao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jun Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoxin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09366v1-abstract-short" style="display: inline;"> Video-based physiology, exemplified by remote photoplethysmography (rPPG), extracts physiological signals such as pulse and respiration by analyzing subtle changes in video recordings. This non-contact, real-time monitoring method holds great potential for home settings. Despite the valuable contributions of public benchmark datasets to this technology, there is currently no dataset specifically d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09366v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09366v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09366v1-abstract-full" style="display: none;"> Video-based physiology, exemplified by remote photoplethysmography (rPPG), extracts physiological signals such as pulse and respiration by analyzing subtle changes in video recordings. 
This non-contact, real-time monitoring method holds great potential for home settings. Despite the valuable contributions of public benchmark datasets to this technology, there is currently no dataset specifically designed for passive home monitoring. Existing datasets are often limited to close-up, static, frontal recordings and typically include only 1-2 physiological signals. To advance video-based physiology in real home settings, we introduce the MHAD dataset. It comprises 1,440 videos from 40 subjects, capturing 6 typical activities from 3 angles in a real home environment. Additionally, 5 physiological signals were recorded, making it a comprehensive video-based physiology dataset. MHAD is compatible with the rPPG-toolbox and has been validated using several unsupervised and supervised methods. Our dataset is publicly available at https://github.com/jdh-algo/MHAD-Dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09366v1-abstract-full').style.display = 'none'; document.getElementById('2409.09366v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08904">arXiv:2409.08904</a> <span> [<a href="https://arxiv.org/pdf/2409.08904">pdf</a>, <a href="https://arxiv.org/format/2409.08904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AnyBipe: An End-to-End Framework for Training and Deploying Bipedal Robots Guided by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yifei Yao</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wentao He</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chenyu Gu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jiaheng Du</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+F">Fuwei Tan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhen Zhu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Junguo Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08904v1-abstract-short" style="display: inline;"> Training and deploying reinforcement learning (RL) policies for robots, especially in accomplishing specific tasks, presents substantial challenges. Recent advancements have explored diverse reward function designs, training techniques, simulation-to-reality (sim-to-real) transfers, and performance analysis methodologies, yet these still require significant human intervention. 
This paper introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08904v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08904v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08904v1-abstract-full" style="display: none;"> Training and deploying reinforcement learning (RL) policies for robots, especially in accomplishing specific tasks, presents substantial challenges. Recent advancements have explored diverse reward function designs, training techniques, simulation-to-reality (sim-to-real) transfers, and performance analysis methodologies, yet these still require significant human intervention. This paper introduces an end-to-end framework for training and deploying RL policies, guided by Large Language Models (LLMs), and evaluates its effectiveness on bipedal robots. The framework consists of three interconnected modules: an LLM-guided reward function design module, an RL training module leveraging prior work, and a sim-to-real homomorphic evaluation module. This design significantly reduces the need for human input by utilizing only essential simulation and deployment platforms, with the option to incorporate human-engineered strategies and historical data. We detail the construction of these modules, their advantages over traditional approaches, and demonstrate the framework's capability to autonomously develop and refine controlling strategies for bipedal robot locomotion, showcasing its potential to operate independently of human intervention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08904v1-abstract-full').style.display = 'none'; document.getElementById('2409.08904v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
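<p class="is-size-7">The outer loop sketched below illustrates, at a generic level, the LLM-in-the-loop structure this abstract describes: an LLM drafts a reward function, an RL trainer uses it, and evaluation feedback is folded into the next prompt. Every callable here is user-supplied and hypothetical; nothing in the sketch is AnyBipe's actual interface.</p>
<pre><code class="language-python"># Hedged sketch of an LLM-guided reward-design loop; all callables are stubs
# supplied by the caller, not AnyBipe's API.
def llm_guided_reward_search(task_description, query_llm, compile_reward,
                             train_policy, evaluate_policy, iterations=5):
    history, best = [], None
    for it in range(iterations):
        prompt = {"task": task_description, "feedback": history}
        reward_code = query_llm(prompt)           # LLM drafts a reward function
        reward_fn = compile_reward(reward_code)   # turn the draft into a callable
        policy = train_policy(reward_fn)          # standard RL training loop
        metrics = evaluate_policy(policy)         # sim and sim-to-real style checks
        history.append({"iteration": it, "reward_code": reward_code,
                        "metrics": metrics})
        if best is None or metrics["score"] > best["metrics"]["score"]:
            best = {"policy": policy, "metrics": metrics, "iteration": it}
    return best, history
</code></pre>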
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08688">arXiv:2409.08688</a> <span> [<a href="https://arxiv.org/pdf/2409.08688">pdf</a>, <a href="https://arxiv.org/format/2409.08688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> GenMapping: Unleashing the Potential of Inverse Perspective Mapping for Robust Online HD Map Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyu Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kailun Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Hao Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">You Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiyong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08688v1-abstract-short" style="display: inline;"> Online High-Definition (HD) maps have emerged as the preferred option for autonomous driving, overshadowing the counterpart offline HD maps due to flexible update capability and lower maintenance costs. However, contemporary online HD map models embed parameters of visual sensors into training, resulting in a significant decrease in generalization performance when applied to visual sensors with di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08688v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08688v1-abstract-full" style="display: none;"> Online High-Definition (HD) maps have emerged as the preferred option for autonomous driving, overshadowing the counterpart offline HD maps due to flexible update capability and lower maintenance costs. However, contemporary online HD map models embed parameters of visual sensors into training, resulting in a significant decrease in generalization performance when applied to visual sensors with different parameters. Inspired by the inherent potential of Inverse Perspective Mapping (IPM), where camera parameters are decoupled from the training process, we have designed a universal map generation framework, GenMapping. The framework is established with a triadic synergy architecture, including principal and dual auxiliary branches. When faced with a coarse road image with local distortion translated via IPM, the principal branch learns robust global features under the state space models. The two auxiliary branches are a dense perspective branch and a sparse prior branch. The former exploits the correlation information between static and moving objects, whereas the latter introduces the prior knowledge of OpenStreetMap (OSM). The triple-enhanced merging module is crafted to synergistically integrate the unique spatial features from all three branches. 
To further improve generalization, a Cross-View Map Learning (CVML) scheme is leveraged to realize joint learning within a common space. Additionally, a Bidirectional Data Augmentation (BiDA) module is introduced to mitigate reliance on specific datasets. A thorough array of experimental results shows that the proposed model surpasses current state-of-the-art methods in both semantic mapping and vectorized mapping, while also maintaining a rapid inference speed.
Submitted 13 September, 2024; originally announced September 2024.
Comments: The source code will be publicly available at https://github.com/lynn-yu/GenMapping
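GenMapping's premise is that Inverse Perspective Mapping decouples camera parameters from learning: the perspective image is warped onto the ground plane before any network sees it. Below is a minimal sketch of that preprocessing step, assuming four known road-plane correspondences and using OpenCV's homography utilities; the point coordinates are invented for illustration and this is not the GenMapping pipeline itself.

```python
import cv2
import numpy as np

def ipm_warp(image: np.ndarray,
             src_pts: np.ndarray,
             dst_pts: np.ndarray,
             bev_size: tuple[int, int]) -> np.ndarray:
    """Warp a perspective camera image to a bird's-eye view (BEV).

    src_pts: four pixel coordinates of points known to lie on the road plane.
    dst_pts: where those points should land in the BEV image.
    Because the homography absorbs the camera intrinsics/extrinsics, a
    downstream map network only ever sees camera-agnostic BEV images.
    """
    H = cv2.getPerspectiveTransform(src_pts.astype(np.float32),
                                    dst_pts.astype(np.float32))
    return cv2.warpPerspective(image, H, bev_size)

if __name__ == "__main__":
    img = np.zeros((720, 1280, 3), dtype=np.uint8)  # stand-in camera frame
    # Hypothetical correspondences: road trapezoid -> rectangle in the BEV grid.
    src = np.array([[400, 500], [880, 500], [1200, 700], [80, 700]])
    dst = np.array([[0, 0], [400, 0], [400, 600], [0, 600]])
    bev = ipm_warp(img, src, dst, (400, 600))
    print(bev.shape)  # (600, 400, 3)
```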
arXiv:2409.07757  [pdf, other]  cs.CV
From Uncertainty to Clarity: Uncertainty-Guided Class-Incremental Learning for Limited Biomedical Samples via Semantic Expansion
Authors: Yifei Yao, Hanrong Zhang
Abstract: In real-world clinical settings, data distributions evolve over time, with a continuous influx of new, limited disease cases. Class-incremental learning is therefore of great significance: deep learning models are required to learn new class knowledge while maintaining accurate recognition of previously seen diseases. However, traditional deep neural networks often suffer from severe forgetting of prior knowledge when adapting to new data unless trained from scratch, which incurs substantial time and computational cost. Additionally, the sample sizes for different diseases can be highly imbalanced, with newly emerging diseases typically having far fewer instances, which causes classification bias. To tackle these challenges, we are the first to propose a class-incremental learning method under limited samples in the biomedical field. First, we propose a novel cumulative entropy prediction module to measure the uncertainty of samples; the most uncertain samples are stored in a memory bank as exemplars for the model's later review. Furthermore, we theoretically demonstrate its effectiveness in measuring uncertainty. Second, we develop a fine-grained semantic expansion module through various augmentations, leading to more compact distributions within the feature space and creating sufficient room for generalization to new classes. In addition, a cosine classifier is utilized to mitigate the classification bias caused by imbalanced datasets. Across four imbalanced data distributions over two datasets, our method achieves optimal performance, surpassing state-of-the-art methods by as much as 53.54% in accuracy.
Submitted 12 September, 2024; originally announced September 2024.
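Two components named in this abstract, uncertainty-based exemplar selection and a cosine classifier, are standard enough to sketch. The PyTorch snippet below is an illustrative approximation under the assumption that uncertainty is measured by predictive entropy and the highest-entropy samples are kept as exemplars; it is not the authors' cumulative entropy module.

```python
import torch
import torch.nn.functional as F

def predictive_entropy(logits: torch.Tensor) -> torch.Tensor:
    """Shannon entropy of the softmax distribution, one value per sample."""
    p = F.softmax(logits, dim=1)
    return -(p * torch.log(p.clamp_min(1e-12))).sum(dim=1)

def select_exemplars(logits: torch.Tensor, k: int) -> torch.Tensor:
    """Indices of the k most uncertain (highest-entropy) samples."""
    return torch.topk(predictive_entropy(logits), k).indices

class CosineClassifier(torch.nn.Module):
    """Classifies by cosine similarity to learned class prototypes.

    Normalizing both features and weights removes the norm imbalance that
    otherwise favors majority classes, which is why cosine classifiers are
    often used to counter class-imbalance bias.
    """
    def __init__(self, feat_dim: int, num_classes: int, scale: float = 16.0):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(num_classes, feat_dim))
        self.scale = scale

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        return self.scale * F.linear(F.normalize(feats, dim=1),
                                     F.normalize(self.weight, dim=1))

if __name__ == "__main__":
    logits = torch.randn(32, 10)             # fake model outputs
    print(select_exemplars(logits, k=5))     # 5 hardest samples to revisit
    clf = CosineClassifier(feat_dim=128, num_classes=10)
    print(clf(torch.randn(32, 128)).shape)   # torch.Size([32, 10])
```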
arXiv:2409.07497  [pdf, other]  cs.AI cs.CL cs.DB cs.IR cs.LG
OneEdit: A Neural-Symbolic Collaboratively Knowledge Editing System
Authors: Ningyu Zhang, Zekun Xi, Yujie Luo, Peng Wang, Bozhong Tian, Yunzhi Yao, Jintian Zhang, Shumin Deng, Mengshu Sun, Lei Liang, Zhiqiang Zhang, Xiaowei Zhu, Jun Zhou, Huajun Chen
Abstract: Knowledge representation has been a central aim of AI since its inception. Symbolic Knowledge Graphs (KGs) and neural Large Language Models (LLMs) can both represent knowledge. KGs provide highly accurate and explicit knowledge representation but face scalability issues, whereas LLMs offer expansive coverage of knowledge but incur significant training costs and struggle with precise and reliable knowledge manipulation.
To this end, we introduce OneEdit, a neural-symbolic prototype system for collaborative knowledge editing using natural language, which facilitates easy-to-use knowledge management with a KG and an LLM. OneEdit consists of three modules: 1) the Interpreter handles user interaction in natural language; 2) the Controller manages editing requests from various users, leveraging the KG with rollbacks to handle knowledge conflicts and prevent toxic knowledge attacks; 3) the Editor uses the knowledge from the Controller to edit the KG and the LLM. We conduct experiments on two new datasets with KGs, which demonstrate that OneEdit achieves superior performance.
Submitted 9 September, 2024; originally announced September 2024.
Comments: LLM+KG@VLDB2024, code is available at https://github.com/zjunlp/OneEdit
arXiv:2409.07446  [pdf, other]  cs.LG cs.CV
Adaptive Adapter Routing for Long-Tailed Class-Incremental Learning
Authors: Zhi-Hong Qi, Da-Wei Zhou, Yiran Yao, Han-Jia Ye, De-Chuan Zhan
Abstract: In our ever-evolving world, new data exhibits a long-tailed distribution, as with e-commerce platform reviews. This necessitates continuously learning from imbalanced data without forgetting, which is the challenge of long-tailed class-incremental learning (LTCIL). Existing methods often rely on retraining linear classifiers with former data, which is impractical in real-world settings. In this paper, we harness the potent representation capabilities of pre-trained models and introduce AdaPtive Adapter RouTing (APART) as an exemplar-free solution for LTCIL. To counteract forgetting, we train inserted adapters with frozen pre-trained weights for deeper adaptation and maintain a pool of adapters for selection during sequential model updates. Additionally, we present an auxiliary adapter pool designed for effective generalization, especially on minority classes. Adaptive instance routing across these pools captures crucial correlations, facilitating a comprehensive representation of all classes. Consequently, APART tackles the imbalance problem as well as catastrophic forgetting in a unified framework. Extensive benchmark experiments validate the effectiveness of APART. Code is available at: https://github.com/vita-qzh/APART
Submitted 11 September, 2024; originally announced September 2024.
Comments: Accepted to Machine Learning Journal. Code is available at: https://github.com/vita-qzh/APART
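The core mechanism described here, routing a frozen backbone's features through a pool of small adapters, can be illustrated compactly. The PyTorch sketch below is an assumption-laden simplification (bottleneck adapters plus softmax routing over the pool); it is not the APART code released by the authors.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """A small bottleneck module added on top of frozen features."""
    def __init__(self, dim: int, hidden: int = 64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, dim))

    def forward(self, x):
        return x + self.net(x)  # residual adaptation

class AdapterPoolRouter(nn.Module):
    """Routes each instance across a pool of adapters with learned weights."""
    def __init__(self, dim: int, pool_size: int, num_classes: int):
        super().__init__()
        self.pool = nn.ModuleList(Adapter(dim) for _ in range(pool_size))
        self.router = nn.Linear(dim, pool_size)   # instance-wise routing logits
        self.head = nn.Linear(dim, num_classes)

    def forward(self, frozen_feats: torch.Tensor) -> torch.Tensor:
        weights = torch.softmax(self.router(frozen_feats), dim=1)           # (B, P)
        adapted = torch.stack([a(frozen_feats) for a in self.pool], dim=1)  # (B, P, D)
        mixed = (weights.unsqueeze(-1) * adapted).sum(dim=1)                # (B, D)
        return self.head(mixed)

if __name__ == "__main__":
    feats = torch.randn(8, 256)               # features from a frozen backbone
    model = AdapterPoolRouter(dim=256, pool_size=4, num_classes=100)
    print(model(feats).shape)                  # torch.Size([8, 100])
```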
arXiv:2409.07361  [pdf, other]  eess.IV cs.CV
Quantifying Knee Cartilage Shape and Lesion: From Image to Metrics
Authors: Yongcheng Yao, Weitian Chen
Abstract: Imaging features of knee articular cartilage have been shown to be potential imaging biomarkers for knee osteoarthritis. Despite recent methodological advancements in image analysis techniques such as image segmentation, registration, and domain-specific image computing algorithms, only a few works focus on building fully automated pipelines for imaging feature extraction. In this study, we developed a deep-learning-based medical image analysis application for knee cartilage morphometrics, the CartiMorph Toolbox (CMT). We propose a 2-stage joint template learning and registration network, CMT-reg. We trained the model on the OAI-ZIB dataset and assessed its performance in template-to-image registration. CMT-reg demonstrated competitive results compared with other state-of-the-art models. We integrated the proposed model into an automated pipeline for the quantification of cartilage shape and lesion (specifically, full-thickness cartilage loss). The toolbox provides a comprehensive, user-friendly solution for medical image analysis and data visualization. The software and models are available at https://github.com/YongchengYAO/CMT-AMAI24paper .
Submitted 11 September, 2024; originally announced September 2024.
Comments: The paper will be in the conference proceedings of AMAI 2024. See the conference website: https://sites.google.com/view/amai2024/home
arXiv:2409.06719  [pdf, other]  cs.IR cs.LG
Dual Adversarial Perturbators Generate rich Views for Recommendation
Authors: Lijun Zhang, Yuan Yao, Haibo Ye
Abstract: Graph contrastive learning (GCL) has been extensively studied and leveraged as a potent tool in recommender systems. Most existing GCL-based recommenders generate contrastive views by altering the graph structure or introducing perturbations to embeddings. While these methods effectively enhance learning from sparse data, they risk performance degradation or even training collapse when the differences between contrastive views become too pronounced. To mitigate this issue, we employ curriculum learning to incrementally increase the disparity between contrastive views, enabling the model to gain from more challenging scenarios. In this paper, we propose a dual-adversarial graph learning approach, AvoGCL, which emulates curriculum learning by progressively applying adversarial training to graph structures and embedding perturbations. Specifically, AvoGCL constructs contrastive views by reducing graph redundancy and generating adversarial perturbations in the embedding space, and achieves better results by gradually increasing the difficulty of the contrastive views. Extensive experiments on three real-world datasets demonstrate that AvoGCL significantly outperforms state-of-the-art competitors.
Submitted 26 August, 2024; originally announced September 2024.
Comments: 16 pages, 6 figures, and 5 tables
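Contrastive-view recommenders of this kind ultimately optimize an InfoNCE-style objective between two perturbed embeddings of the same user or item. As a point of reference only, here is a generic InfoNCE loss over two embedding views with a growing perturbation magnitude standing in for the curriculum of increasingly difficult views; it is not AvoGCL itself.

```python
import torch
import torch.nn.functional as F

def info_nce(view_a: torch.Tensor, view_b: torch.Tensor, tau: float = 0.2) -> torch.Tensor:
    """InfoNCE between two views: matching rows are positives, all others negatives."""
    a = F.normalize(view_a, dim=1)
    b = F.normalize(view_b, dim=1)
    logits = a @ b.t() / tau                      # (N, N) similarity matrix
    targets = torch.arange(a.size(0))             # i-th row should match i-th column
    return F.cross_entropy(logits, targets)

def perturbed_view(emb: torch.Tensor, eps: float) -> torch.Tensor:
    """Add a norm-bounded perturbation; eps grows over training (curriculum)."""
    noise = F.normalize(torch.randn_like(emb), dim=1) * eps
    return emb + noise

if __name__ == "__main__":
    emb = torch.randn(64, 128)                    # user/item embeddings
    for eps in (0.01, 0.05, 0.1):                 # increasingly hard views
        loss = info_nce(perturbed_view(emb, eps), perturbed_view(emb, eps))
        print(f"eps={eps}: loss={loss.item():.3f}")
```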
arXiv:2409.06189  [pdf, other]  cs.CV
MyGo: Consistent and Controllable Multi-View Driving Video Generation with Camera Control
Authors: Yining Yao, Xi Guo, Chenjing Ding, Wei Wu
Abstract: High-quality driving video generation is crucial for providing training data for autonomous driving models. However, current generative models rarely focus on enhancing camera motion control under multi-view tasks, which is essential for driving video generation. Therefore, we propose MyGo, an end-to-end framework for video generation that introduces the motion of onboard cameras as conditions to improve camera controllability and multi-view consistency. MyGo employs additional plug-in modules to inject camera parameters into the pre-trained video diffusion model, retaining the extensive knowledge of the pre-trained model as much as possible. Furthermore, we use epipolar constraints and neighbor view information during the generation process of each view to enhance spatial-temporal consistency.
Experimental results show that MyGo achieves state-of-the-art results in both general camera-controlled video generation and multi-view driving video generation tasks, laying the foundation for more accurate environment simulation in autonomous driving. Project page: https://metadrivescape.github.io/papers_project/MyGo/page.html
Submitted 11 September, 2024; v1 submitted 9 September, 2024; originally announced September 2024.
Comments: Project Page: https://metadrivescape.github.io/papers_project/MyGo/page.html

arXiv:2409.05806  [pdf, other]  cs.CL cs.AI cs.IR cs.LG
Benchmarking Chinese Knowledge Rectification in Large Language Models
Authors: Tianhe Lu, Jizhan Fang, Yunzhi Yao, Xin Xu, Ningyu Zhang, Huajun Chen
Abstract: While Large Language Models (LLMs) exhibit remarkable generative capabilities, they are not without flaws, particularly in the form of hallucinations. This issue is even more pronounced when LLMs are applied to specific languages and domains. For example, LLMs may generate nonsensical information when handling ancient Chinese poetry, proverbs, or idioms, owing to the lack of specific knowledge.
To this end, this paper introduces a benchmark for rectifying Chinese knowledge in LLMs via knowledge editing. Specifically, we introduce a new Chinese dataset, CKnowEdit, by collecting seven types of knowledge from various sources, including classical texts, idioms, and content from Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony, antithesis, and logical constructs inherent in the Chinese language. Through the analysis of this dataset, we uncover the challenges faced by current LLMs in mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge editing techniques on this dataset unveils the substantial scope for advancement in the rectification of Chinese knowledge. Code and dataset are available at https://github.com/zjunlp/EasyEdit.
Submitted 9 September, 2024; originally announced September 2024.
Comments: Ongoing work; code and dataset are available at https://github.com/zjunlp/EasyEdit

arXiv:2409.03966  [pdf, other]  cs.RO
Automating Robot Failure Recovery Using Vision-Language Models With Optimized Prompts
Authors: Hongyi Chen, Yunchao Yao, Ruixuan Liu, Changliu Liu, Jeffrey Ichnowski
Abstract: Current robot autonomy struggles to operate beyond the assumed Operational Design Domain (ODD), the specific set of conditions and environments in which the system is designed to function, while the real world is rife with uncertainties that may lead to failures. Automating recovery remains a significant challenge. Traditional methods often rely on human intervention to manually address failures, or require exhaustive enumeration of failure cases and the design of specific recovery policies for each scenario, both of which are labor-intensive. Foundational Vision-Language Models (VLMs), which demonstrate remarkable common-sense generalization and reasoning capabilities, have broader, potentially unbounded ODDs. However, limitations in spatial reasoning remain a common challenge for many VLMs when applied to robot control and motion-level error recovery. In this paper, we investigate how optimizing visual and text prompts can enhance the spatial reasoning of VLMs, enabling them to function effectively as black-box controllers for both motion-level position correction and task-level recovery from unknown failures.
Specifically, the optimizations include identifying key visual elements in visual prompts, highlighting these elements in text prompts for querying, and decomposing the reasoning process for failure detection and control generation. In experiments, prompt optimizations significantly outperform pre-trained Vision-Language-Action Models in correcting motion-level position errors and improve accuracy by 65.78% compared to VLMs with unoptimized prompts. Additionally, for task-level failures, optimized prompts enhanced the success rate by 5.8%, 5.8%, and 7.5% in VLMs' abilities to detect failures, analyze issues, and generate recovery plans, respectively, across a wide range of unknown errors in Lego assembly.
Submitted 5 September, 2024; originally announced September 2024.

arXiv:2409.03445  [pdf, other]  cs.RO
Neural HD Map Generation from Multiple Vectorized Tiles Locally Produced by Autonomous Vehicles
Authors: Miao Fan, Yi Yao, Jianping Zhang, Xiangbo Song, Daihui Wu
Abstract: A high-definition (HD) map is a fundamental component of autonomous driving systems, as it provides precise environmental information about driving scenes.
Recent work on vectorized map generation can produce merely 65% of the local map elements around the ego-vehicle at runtime from one tour with onboard sensors, leaving open the question of how to construct a global HD map projected in the world coordinate system under high quality standards. To address the issue, we present GNMap, an end-to-end generative neural network that automatically constructs HD maps from multiple vectorized tiles produced locally by autonomous vehicles over several tours. It leverages a multi-layer, attention-based autoencoder as the shared network, whose parameters are learned from two different tasks (pretraining and finetuning, respectively) to ensure both the completeness of the generated maps and the correctness of element categories. Abundant qualitative evaluations are conducted on a real-world dataset, and experimental results show that GNMap surpasses the SOTA method by more than 5% F1 score, reaching the level of industrial usage with a small amount of manual modification. We have already deployed it at Navinfo Co., Ltd., where it serves as indispensable software for automatically building HD maps for autonomous driving systems.
Submitted 5 September, 2024; originally announced September 2024.
Comments: Accepted by SpatialDI'24

arXiv:2409.03346  [pdf, other]  cs.CL cs.AI
Sketch: A Toolkit for Streamlining LLM Operations
Authors: Xin Jiang, Xiang Li, Wenjia Ma, Xuezhi Fang, Yiqun Yao, Naitong Yu, Xuying Meng, Peng Han, Jing Li, Aixin Sun, Yequan Wang
Abstract: Large language models (LLMs)
represented by the GPT family have achieved remarkable success. The characteristics of LLMs lie in their ability to accommodate a wide range of tasks through a generative approach. However, the flexibility of their output format poses challenges in controlling and harnessing the model's outputs, thereby constraining the application of LLMs in various domains. In this work, we present Sketch, an innovative toolkit designed to streamline LLM operations across diverse fields. Sketch comprises the following components: (1) a suite of task description schemas and prompt templates encompassing various NLP tasks; (2) a user-friendly, interactive process for building structured-output LLM services tailored to various NLP tasks; (3) an open-source dataset for output format control, along with tools for dataset construction; and (4) an open-source model based on LLaMA3-8B-Instruct that adeptly comprehends and adheres to output formatting instructions. We anticipate this initiative will bring considerable convenience to LLM users, achieving the goal of "plug-and-play" for various applications. The components of Sketch will be progressively open-sourced at https://github.com/cofe-ai/Sketch.
Submitted 5 September, 2024; originally announced September 2024.
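Output-format control of the kind Sketch targets usually boils down to validating a model's response against a task schema and retrying on mismatch. The snippet below is a generic illustration of that pattern using only Python's standard library; the schema, the `fake_llm` stub, and the field names are invented for the example and are not part of Sketch.

```python
import json

# A hypothetical task description schema: required fields and their types.
SCHEMA = {"label": str, "confidence": float, "rationale": str}

def fake_llm(prompt: str) -> str:
    """Stand-in for an LLM call; returns a JSON string."""
    return '{"label": "positive", "confidence": 0.92, "rationale": "upbeat wording"}'

def validate(raw: str, schema: dict) -> dict:
    """Parse the model output and check it against the schema, raising on mismatch."""
    data = json.loads(raw)
    for field, typ in schema.items():
        if field not in data:
            raise ValueError(f"missing field: {field}")
        if not isinstance(data[field], typ):
            raise TypeError(f"field {field!r} should be {typ.__name__}")
    return data

def structured_call(prompt: str, schema: dict, retries: int = 2) -> dict:
    """Query, validate, and retry: the basic loop behind structured-output services."""
    for _ in range(retries + 1):
        try:
            return validate(fake_llm(prompt), schema)
        except (ValueError, TypeError, json.JSONDecodeError):
            continue
    raise RuntimeError("model never produced schema-conformant output")

if __name__ == "__main__":
    print(structured_call("Classify the sentiment of: 'great product!'", SCHEMA))
```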