Search | arXiv e-print repository
Showing 1–50 of 313 results for author: Hu, T
Searching in archive cs.
1. arXiv:2502.11974 [pdf, other]
cs.CV (Computer Vision and Pattern Recognition)
Image Inversion: A Survey from GANs to Diffusion and Beyond
Authors: Yinan Chen, Jiangning Zhang, Yali Bi, Xiaobin Hu, Teng Hu, Zhucun Xue, Ran Yi, Yong Liu, Ying Tai
Abstract: Image inversion is a fundamental task in generative models, aiming to map images back to their latent representations to enable downstream applications such as editing, restoration, and style transfer. This paper provides a comprehensive review of the latest advancements in image inversion techniques, focusing on two main paradigms: Generative Adversarial Network (GAN) inversion and diffusion model inversion. We categorize these techniques based on their optimization methods. For GAN inversion, we systematically classify existing methods into encoder-based approaches, latent optimization approaches, and hybrid approaches, analyzing their theoretical foundations, technical innovations, and practical trade-offs. For diffusion model inversion, we explore training-free strategies, fine-tuning methods, and the design of additional trainable modules, highlighting their unique advantages and limitations. Additionally, we discuss several popular downstream applications and emerging applications beyond image tasks, identifying current challenges and future research directions. By synthesizing the latest developments, this paper aims to provide researchers and practitioners with a valuable reference resource, promoting further advancements in the field of image inversion. We keep track of the latest works at https://github.com/RyanChenYN/ImageInversion
Submitted 17 February, 2025; originally announced February 2025.
Comments: 10 pages, 2 figures
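A note on the latent-optimization family of GAN inversion methods surveyed above: the basic recipe is to freeze a pretrained generator G and optimize a latent code z until G(z) reproduces the target image. The sketch below is a minimal illustration under assumed names (generator, target) and an assumed pixel-space loss; it is not any specific method from the survey.

```python
import torch

def invert(generator, target, latent_dim=512, steps=500, lr=0.05):
    """Minimal latent-optimization GAN inversion (illustrative sketch).

    generator: assumed pretrained, frozen mapping z -> image.
    target:    image tensor with the shape the generator outputs.
    """
    z = torch.randn(1, latent_dim, requires_grad=True)        # random initial latent code
    optimizer = torch.optim.Adam([z], lr=lr)
    for _ in range(steps):
        optimizer.zero_grad()
        reconstruction = generator(z)                          # G(z)
        loss = torch.nn.functional.mse_loss(reconstruction, target)
        loss.backward()                                        # gradients flow only to z
        optimizer.step()
    return z.detach()                                          # latent code with G(z) close to target
```

Encoder-based approaches instead train a network E so that E(image) approximates z directly, and hybrid approaches use E(image) to initialize an optimization like the one above.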
2. arXiv:2502.11744 [pdf, other]
cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
FUNCTO: Function-Centric One-Shot Imitation Learning for Tool Manipulation
Authors: Chao Tang, Anxing Xiao, Yuhong Deng, Tianrun Hu, Wenlong Dong, Hanbo Zhang, David Hsu, Hong Zhang
Abstract: Learning tool use from a single human demonstration video offers a highly intuitive and efficient approach to robot teaching. While humans can effortlessly generalize a demonstrated tool manipulation skill to diverse tools that support the same function (e.g., pouring with a mug versus a teapot), current one-shot imitation learning (OSIL) methods struggle to achieve this. A key challenge lies in establishing functional correspondences between demonstration and test tools, considering significant geometric variations among tools with the same function (i.e., intra-function variations). To address this challenge, we propose FUNCTO (Function-Centric OSIL for Tool Manipulation), an OSIL method that establishes function-centric correspondences with a 3D functional keypoint representation, enabling robots to generalize tool manipulation skills from a single human demonstration video to novel tools with the same function despite significant intra-function variations. With this formulation, we factorize FUNCTO into three stages: (1) functional keypoint extraction, (2) function-centric correspondence establishment, and (3) functional keypoint-based action planning. We evaluate FUNCTO against existing modular OSIL methods and end-to-end behavioral cloning methods through real-robot experiments on diverse tool manipulation tasks. The results demonstrate the superiority of FUNCTO when generalizing to novel tools with intra-function geometric variations. More details are available at https://sites.google.com/view/functo.
Submitted 17 February, 2025; originally announced February 2025.

3. arXiv:2502.11234 [pdf, other]
cs.CV (Computer Vision and Pattern Recognition)
MaskFlow: Discrete Flows For Flexible and Efficient Long Video Generation
Authors: Michael Fuest, Vincent Tao Hu, Björn Ommer
Abstract: Generating long, high-quality videos remains a challenge due to the complex interplay of spatial and temporal dynamics and hardware limitations. In this work, we introduce MaskFlow, a unified video generation framework that combines discrete representations with flow-matching to enable efficient generation of high-quality long videos. By leveraging a frame-level masking strategy during training, MaskFlow conditions on previously generated unmasked frames to generate videos with lengths ten times beyond that of the training sequences. MaskFlow does so very efficiently by enabling the use of fast Masked Generative Model (MGM)-style sampling and can be deployed in both fully autoregressive as well as full-sequence generation modes. We validate the quality of our method on the FaceForensics (FFS) and Deepmind Lab (DMLab) datasets and report Fréchet Video Distance (FVD) competitive with state-of-the-art approaches. We also provide a detailed analysis on the sampling efficiency of our method and demonstrate that MaskFlow can be applied to both timestep-dependent and timestep-independent models in a training-free manner.
Submitted 16 February, 2025; originally announced February 2025.

4. arXiv:2502.11147 [pdf, other]
cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Efficient Long-Decoding Inference with Reasoning-Aware Attention Sparsity
Authors: Junhao Hu, Wenrui Huang, Weidong Wang, Zhenwen Li, Tiancheng Hu, Zhixia Liu, Xusheng Chen, Tao Xie, Yizhou Shan
Abstract: Large Language Models (LLMs) have demonstrated strong capabilities across various domains, with recent advancements in challenging reasoning tasks such as mathematics and programming. However, solving reasoning tasks often requires long decoding chains (of thoughts), which incur $O(N)$ time and memory consumption, where $N$ is the chain length. To mitigate $O(N)$ time and memory consumption, existing sparsity-based algorithms propose retaining only the most critical tokens' intermediate data (i.e., key-value cache) and discarding the rest. However, these existing algorithms struggle with the "impossible trinity" of accuracy, time, and memory. For example, the state-of-the-art algorithm, Quest, achieves high accuracy with $O(L)$ time but $O(N)$ memory ($L$ is the cache budget, $L \ll N$). To address this issue, in this paper, we identify a new attention pattern during the decode stage of reasoning tasks, where milestone tokens (analogous to lemmas in mathematical proofs) emerge, are utilized, and then become unimportant afterward. Based on this pattern, we propose a new algorithm named RaaS that identifies and retains milestone tokens only until they are no longer needed, achieving high accuracy with $O(L)$ time and $O(L)$ memory complexity.
Submitted 16 February, 2025; originally announced February 2025.
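The milestone-token pattern described above can be pictured with a toy key-value cache policy: keep at most a budget of entries and drop tokens once they have gone unattended for a while. This is only a loose illustration under assumed thresholds and scoring; it is not the RaaS algorithm.

```python
def update_kv_cache(cache, step, token_id, kv, attn, budget=64, stale_after=8, min_attn=0.05):
    """Toy fixed-budget KV cache that evicts tokens no longer receiving attention.

    cache: dict token_id -> {"kv": ..., "last_used": step}
    attn:  dict token_id -> attention weight this token received at this decode step
    """
    cache[token_id] = {"kv": kv, "last_used": step}            # newest token always enters
    for tid, entry in cache.items():
        if attn.get(tid, 0.0) >= min_attn:                     # still in use (a live "milestone")
            entry["last_used"] = step
    # Evict the longest-unattended tokens first, but only once they are stale
    # and only while the cache exceeds its budget.
    for tid in sorted(cache, key=lambda t: cache[t]["last_used"]):
        if len(cache) <= budget:
            break
        if tid != token_id and step - cache[tid]["last_used"] >= stale_after:
            del cache[tid]
    return cache
```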
5. arXiv:2502.01450 [pdf, other]
cs.SI (Social and Information Networks); cs.AI (Artificial Intelligence)
Simulating Rumor Spreading in Social Networks using LLM Agents
Authors: Tianrui Hu, Dimitrios Liakopoulos, Xiwen Wei, Radu Marculescu, Neeraja J. Yadwadkar
Abstract: With the rise of social media, misinformation has become increasingly prevalent, fueled largely by the spread of rumors. This study explores the use of Large Language Model (LLM) agents within a novel framework to simulate and analyze the dynamics of rumor propagation across social networks. To this end, we design a variety of LLM-based agent types and construct four distinct network structures to conduct these simulations. Our framework assesses the effectiveness of different network constructions and agent behaviors in influencing the spread of rumors. Our results demonstrate that the framework can simulate rumor spreading across more than one hundred agents in various networks with thousands of edges. The evaluations indicate that network structure, personas, and spreading schemes can significantly influence rumor dissemination, ranging from no spread to affecting 83% of agents in iterations, thereby offering a realistic simulation of rumor spread in social networks.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 7 pages, 8 figures
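The kind of agent-based propagation this framework simulates can be outlined with a plain graph loop. The network model, the acceptance rule, and the spread probability below are placeholders; in the paper the per-agent decision is made by LLM personas, not a coin flip.

```python
import random
import networkx as nx

def simulate_rumor(graph, seeds, rounds=10, spread_prob=0.3, rng=random.Random(0)):
    """Toy rumor propagation: each round, believers try to convince their neighbors."""
    believers = set(seeds)
    for _ in range(rounds):
        newly_convinced = set()
        for agent in believers:
            for neighbor in graph.neighbors(agent):
                if neighbor not in believers and rng.random() < spread_prob:
                    newly_convinced.add(neighbor)          # neighbor accepts the rumor
        believers |= newly_convinced
    return believers

# Example: a small-world network of 100 agents, rumor seeded at one node.
g = nx.watts_strogatz_graph(100, k=4, p=0.1, seed=0)
reached = simulate_rumor(g, seeds=[0])
print(f"rumor reached {len(reached)} of {g.number_of_nodes()} agents")
```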
6. arXiv:2501.15186 [pdf, other]
math.NA (Numerical Analysis); cs.LG (Machine Learning)
An Iterative Deep Ritz Method for Monotone Elliptic Problems
Authors: Tianhao Hu, Bangti Jin, Fengru Wang
Abstract: In this work, we present a novel iterative deep Ritz method (IDRM) for solving a general class of elliptic problems. It is inspired by the iterative procedure for minimizing the loss during the training of the neural network, but at each step encodes the geometry of the underlying function space and incorporates a convex penalty to enhance the performance of the algorithm. The algorithm is applicable to elliptic problems involving a monotone operator (not necessarily of variational form) and does not impose any stringent regularity assumption on the solution. It improves several existing neural PDE solvers, e.g., physics informed neural network and deep Ritz method, in terms of the accuracy for the concerned class of elliptic problems. Further, we establish a convergence rate for the method using tools from geometry of Banach spaces and theory of monotone operators, and also analyze the learning error. To illustrate the effectiveness of the method, we present several challenging examples, including a comparative study with existing techniques.
Submitted 25 January, 2025; originally announced January 2025.
Comments: 31 pages, 9 figures
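For context, the (non-iterative) deep Ritz idea that IDRM builds on minimizes an energy functional over a neural ansatz by Monte Carlo sampling. The sketch below does this for the 1D Poisson problem -u'' = f with zero boundary values; the network, sampling, and penalty-free energy are generic assumptions and not the iterative scheme proposed in the paper.

```python
import torch

# Neural ansatz for u(x) on (0, 1); the boundary condition u(0) = u(1) = 0 is hard-coded below.
net = torch.nn.Sequential(torch.nn.Linear(1, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1))
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
f = lambda x: torch.ones_like(x)                     # source term of -u'' = f

for step in range(2000):
    x = torch.rand(256, 1, requires_grad=True)       # Monte Carlo collocation points
    u = x * (1.0 - x) * net(x)                       # ansatz satisfying the boundary condition
    du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
    energy = (0.5 * du**2 - f(x) * u).mean()         # Ritz energy estimate of ∫(½|u'|² - f·u)dx
    optimizer.zero_grad()
    energy.backward()
    optimizer.step()
```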
7. arXiv:2501.13111 [pdf, other]
cs.SE (Software Engineering); cs.LG (Machine Learning)
iServe: An Intent-based Serving System for LLMs
Authors: Dimitrios Liakopoulos, Tianrui Hu, Prasoon Sinha, Neeraja J. Yadwadkar
Abstract: Large Language Models (LLMs) are becoming ubiquitous across industries, where applications demand they fulfill diverse user intents. However, developers currently face the challenge of manually exploring numerous deployment configurations - combinations of parallelism and compression techniques that impact resource usage, latency, cost, and accuracy - to meet these intents. Assessing the impact of these configurations on user metrics requires extensive, costly profiling for each model. Existing approaches avoid this expense by using fixed, static configurations, but this often leads to sub-optimal performance and higher costs. Moreover, none of these solutions dynamically adapt to changing user intents to balance latency and cost effectively. We present iServe, an automated, intent-based system for distributed LLM inference. Instead of manually selecting deployment configurations, developers simply specify their intent - such as minimizing latency, reducing cost, or meeting specific targets for either. iServe introduces fingerprints, lightweight representations of LLMs, to efficiently estimate how different configurations impact latency and memory usage. Based on these insights and GPU availability, iServe dynamically selects the optimal configuration to align with the user's intent. For various LLMs and query arrival rates, iServe best meets user intents compared to state-of-the-art systems by reducing latency by 77.62% and SLO violations by 7.09x while improving GPU throughput by 4.72x. Moreover, iServe's fingerprint-based profiling reduces profiling cost by 6.05x (GPU-hours) compared to baselines.
Submitted 8 January, 2025; originally announced January 2025.
Comments: 19 pages, 24 figures
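The selection step described above (estimate per-configuration latency and memory, then pick the configuration that best matches the stated intent) reduces to a small search. The configuration records, metric names, and intent format below are invented for illustration and are not iServe's interface.

```python
def pick_config(configs, objective, latency_slo_ms=None):
    """Choose a deployment configuration matching a user intent (illustrative sketch).

    configs:   list of dicts with estimated "latency_ms", "cost", "memory_gb".
    objective: key to minimize, e.g. "cost" or "latency_ms".
    """
    feasible = [c for c in configs if latency_slo_ms is None or c["latency_ms"] <= latency_slo_ms]
    if not feasible:                               # nothing meets the SLO: fall back to fastest
        return min(configs, key=lambda c: c["latency_ms"])
    return min(feasible, key=lambda c: c[objective])

configs = [
    {"name": "tp2-fp16", "latency_ms": 120, "cost": 4.0, "memory_gb": 80},
    {"name": "tp1-int8", "latency_ms": 200, "cost": 1.5, "memory_gb": 40},
]
print(pick_config(configs, objective="cost", latency_slo_ms=250)["name"])   # -> tp1-int8
```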
8. arXiv:2501.12380 [pdf, other]
cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
MMVU: Measuring Expert-Level Multi-Discipline Video Understanding
Authors: Yilun Zhao, Lujing Xie, Haowei Zhang, Guo Gan, Yitao Long, Zhiyuan Hu, Tongyan Hu, Weiyuan Chen, Chuhan Li, Junyang Song, Zhijian Xu, Chengye Wang, Weifeng Pan, Ziyao Shangguan, Xiangru Tang, Zhenwen Liang, Yixin Liu, Chen Zhao, Arman Cohan
Abstract: We introduce MMVU, a comprehensive expert-level, multi-discipline benchmark for evaluating foundation models in video understanding. MMVU includes 3,000 expert-annotated questions spanning 27 subjects across four core disciplines: Science, Healthcare, Humanities & Social Sciences, and Engineering. Compared to prior benchmarks, MMVU features three key advancements. First, it challenges models to apply domain-specific knowledge and perform expert-level reasoning to analyze specialized-domain videos, moving beyond the basic visual perception typically assessed in current video benchmarks. Second, each example is annotated by human experts from scratch. We implement strict data quality controls to ensure the high quality of the dataset. Finally, each example is enriched with expert-annotated reasoning rationales and relevant domain knowledge, facilitating in-depth analysis. We conduct an extensive evaluation of 32 frontier multimodal foundation models on MMVU. The latest System-2-capable models, o1 and Gemini 2.0 Flash Thinking, achieve the highest performance among the tested models. However, they still fall short of matching human expertise. Through in-depth error analyses and case studies, we offer actionable insights for future advancements in expert-level, knowledge-intensive video understanding for specialized domains.
Submitted 21 January, 2025; originally announced January 2025.

9. arXiv:2501.11231 [pdf, other]
cs.CV (Computer Vision and Pattern Recognition)
KPL: Training-Free Medical Knowledge Mining of Vision-Language Models
Authors: Jiaxiang Liu, Tianxiang Hu, Jiawei Du, Ruiyuan Zhang, Joey Tianyi Zhou, Zuozhu Liu
Abstract: Visual Language Models such as CLIP excel in image recognition due to extensive image-text pre-training. However, applying the CLIP inference in zero-shot classification, particularly for medical image diagnosis, faces challenges due to: 1) the inadequacy of representing image classes solely with single category names; 2) the modal gap between the visual and text spaces generated by CLIP encoders. Despite attempts to enrich disease descriptions with large language models, the lack of class-specific knowledge often leads to poor performance. In addition, empirical evidence suggests that existing proxy learning methods for zero-shot image classification on natural image datasets exhibit instability when applied to medical datasets. To tackle these challenges, we introduce the Knowledge Proxy Learning (KPL) to mine knowledge from CLIP. KPL is designed to leverage CLIP's multimodal understandings for medical image classification through Text Proxy Optimization and Multimodal Proxy Learning. Specifically, KPL retrieves image-relevant knowledge descriptions from the constructed knowledge-enhanced base to enrich semantic text proxies. It then harnesses input images and these descriptions, encoded via CLIP, to stably generate multimodal proxies that boost the zero-shot classification performance. Extensive experiments conducted on both medical and natural image datasets demonstrate that KPL enables effective zero-shot image classification, outperforming all baselines. These findings highlight the great potential in this paradigm of mining knowledge from CLIP for medical image classification and broader areas.
Submitted 19 January, 2025; originally announced January 2025.
Comments: AAAI (Oral)
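The zero-shot setting KPL starts from can be made concrete: classify an image by comparing its embedding against one text proxy per class, where each proxy is built from several knowledge descriptions rather than a single category name. The sketch below is the plain CLIP-style baseline with averaged description embeddings; it does not implement KPL's Text Proxy Optimization or Multimodal Proxy Learning, and the encoder callables are assumptions.

```python
import torch
import torch.nn.functional as F

def zero_shot_scores(image_emb, class_descriptions, encode_text):
    """CLIP-style zero-shot scoring with knowledge-enriched class prompts (sketch).

    image_emb:          (N, D) image embeddings from a CLIP-like image encoder.
    class_descriptions: one list of description strings per class (e.g., retrieved
                        disease knowledge); averaging them yields a simple text proxy.
    encode_text:        callable mapping a list of strings -> (K, D) text embeddings.
    """
    proxies = []
    for descriptions in class_descriptions:
        emb = F.normalize(encode_text(descriptions), dim=-1)     # (K, D) per-description
        proxies.append(F.normalize(emb.mean(dim=0), dim=-1))     # average -> one proxy per class
    text_proxies = torch.stack(proxies)                          # (C, D)
    image_emb = F.normalize(image_emb, dim=-1)
    return image_emb @ text_proxies.T                            # (N, C) cosine similarities
```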
10. arXiv:2501.10252 [pdf, other]
quant-ph (Quantum Physics); cs.ET (Emerging Technologies)
Dynamic Routing in Space-Ground Integrated Quantum Networks
Authors: Tianjie Hu, Jindi Wu, Qun Li
Abstract: Quantum networks emerge as fundamental frameworks for addressing various large-scale problems. There are two primary architectures: space-based quantum networks, which deploy satellites with free space channels to interconnect users, and ground-based quantum networks, which utilize optical fibers to interconnect users. In this paper, we explore space-ground integrated quantum networks that incorporate both satellites and optical fibers into the infrastructure. This integrated network features three forms of communication: using only free space links, only ground links, or a hybrid usage of free space and ground links. We formulate the routing problem in space-ground integrated quantum networks as an integer program and propose two solutions: using a linear relaxation and a greedy algorithm. The linear relaxation algorithm allows timely scheduling of additional entanglement purification, whereas the greedy algorithm enables quick scheduling. Simulation results demonstrate their effective balancing between network throughput and communication fidelity.
Submitted 17 January, 2025; originally announced January 2025.
Comments: Full version of paper submitted to 2024 IEEE Military Communications Conference (MILCOM)
Journal ref: MILCOM 2024 - 2024 IEEE Military Communications Conference (MILCOM)
DOI: 10.1109/MILCOM61039.2024.10773637
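A greedy flavor of the routing problem above (assign each entanglement request the candidate path with the best estimated end-to-end fidelity that still has free capacity) can be sketched as follows. The product-of-link-fidelities estimate and the capacity bookkeeping are simplifications for illustration, not the paper's integer-programming formulation.

```python
import math

def greedy_route(requests, candidate_paths, fidelity, capacity):
    """Greedy path assignment for entanglement requests (illustrative sketch).

    requests:        list of (src, dst) pairs.
    candidate_paths: dict (src, dst) -> list of paths, each a list of links.
    fidelity:        dict link -> per-link fidelity in (0, 1].
    capacity:        dict link -> remaining channel capacity.
    """
    assignment = {}
    for req in requests:
        best_path, best_fid = None, 0.0
        for path in candidate_paths.get(req, []):
            if any(capacity[link] <= 0 for link in path):
                continue                                        # path saturated, skip
            fid = math.prod(fidelity[link] for link in path)    # end-to-end fidelity estimate
            if fid > best_fid:
                best_path, best_fid = path, fid
        if best_path is not None:
            assignment[req] = best_path
            for link in best_path:
                capacity[link] -= 1                             # consume one channel per link
    return assignment
```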
11. arXiv:2501.10062 [pdf, other]
cs.LG (Machine Learning); cs.CL (Computation and Language)
OMoE: Diversifying Mixture of Low-Rank Adaptation by Orthogonal Finetuning
Authors: Jinyuan Feng, Zhiqiang Pu, Tianyi Hu, Dongmin Li, Xiaolin Ai, Huimu Wang
Abstract: Building mixture-of-experts (MoE) architecture for Low-rank adaptation (LoRA) is emerging as a potential direction in parameter-efficient fine-tuning (PEFT) for its modular design and remarkable performance. However, simply stacking the number of experts cannot guarantee significant improvement. In this work, we first conduct qualitative analysis to indicate that experts collapse to similar representations in vanilla MoE, limiting the capacity of modular design and computational efficiency. Further, our analysis reveals that the performance of previous MoE variants may be limited by a lack of diversity among experts. Motivated by these findings, we propose Orthogonal Mixture-of-Experts (OMoE), a resource-efficient MoE variant that trains experts in an orthogonal manner to promote diversity. In OMoE, a Gram-Schmidt process is leveraged to enforce that the experts' representations lie within the Stiefel manifold. By applying orthogonal constraints directly to the architecture, OMoE keeps the learning objective unchanged, without compromising optimality. Our method is simple and alleviates memory bottlenecks, as it requires only a minimal number of experts compared to vanilla MoE models. Experiments on diverse commonsense reasoning benchmarks demonstrate that OMoE can consistently achieve stable and efficient performance improvement when compared with the state-of-the-art methods while significantly reducing the number of required experts.
Submitted 17 January, 2025; originally announced January 2025.
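The Gram-Schmidt step the abstract names has a standard form: project each expert's representation off the span of the previously processed ones and renormalize, so the resulting set is orthonormal (i.e., lies on the Stiefel manifold). The sketch below applies it to a stack of expert vectors; where OMoE inserts this constraint in the LoRA-MoE architecture is not shown here.

```python
import torch

def gram_schmidt(vectors, eps=1e-8):
    """Orthonormalize row vectors via classical Gram-Schmidt.

    vectors: (num_experts, dim) tensor of expert representations.
    Returns rows that are mutually orthogonal with unit norm.
    """
    basis = []
    for v in vectors:
        for b in basis:
            v = v - (v @ b) * b               # remove the component along earlier directions
        norm = v.norm()
        if norm > eps:                        # drop (near-)linearly-dependent vectors
            basis.append(v / norm)
    return torch.stack(basis)

experts = torch.randn(4, 16)
q = gram_schmidt(experts)
print(torch.allclose(q @ q.T, torch.eye(q.shape[0]), atol=1e-5))   # True: rows are orthonormal
```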
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08760">arXiv:2501.08760</a> <span> [<a href="https://arxiv.org/pdf/2501.08760">pdf</a>, <a href="https://arxiv.org/format/2501.08760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Leveraging LLM Agents for Translating Network Configurations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yunze Wei</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xiaohui Xie</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+Y">Yiwei Zuo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tianshuo Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+K">Kaiwen Chi</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yong Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08760v1-abstract-short" style="display: inline;"> Configuration translation is a critical and frequent task in network operations. When a network device is damaged or outdated, administrators need to replace it to maintain service continuity. The replacement devices may originate from different vendors, necessitating configuration translation to ensure seamless network operation. However, translating configurations manually is a labor-intensive a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08760v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08760v1-abstract-full" style="display: none;"> Configuration translation is a critical and frequent task in network operations. When a network device is damaged or outdated, administrators need to replace it to maintain service continuity. The replacement devices may originate from different vendors, necessitating configuration translation to ensure seamless network operation. However, translating configurations manually is a labor-intensive and error-prone process. In this paper, we propose an intent-based framework for translating network configuration with Large Language Model (LLM) Agents. The core of our approach is an Intent-based Retrieval Augmented Generation (IRAG) module that systematically splits a configuration file into fragments, extracts intents, and generates accurate translations. We also design a two-stage verification method to validate the syntax and semantics correctness of the translated configurations. We implement and evaluate the proposed method on real-world network configurations. Experimental results show that our method achieves 97.74% syntax correctness, outperforming state-of-the-art methods in translation accuracy. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08760v1-abstract-full').style.display = 'none'; document.getElementById('2501.08760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08168">arXiv:2501.08168</a> <span> [<a href="https://arxiv.org/pdf/2501.08168">pdf</a>, <a href="https://arxiv.org/format/2501.08168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LeapVAD: A Leap in Autonomous Driving via Cognitive Perception and Dual-Process Thinking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yukai Ma</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tiantian Wei</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+N">Naiting Zhong</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jianbiao Mei</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tao Hu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+L">Licheng Wen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuemeng Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08168v1-abstract-short" style="display: inline;"> While autonomous driving technology has made remarkable strides, data-driven approaches still struggle with complex scenarios due to their limited reasoning capabilities. Meanwhile, knowledge-driven autonomous driving systems have evolved considerably with the popularization of visual language models. In this paper, we propose LeapVAD, a novel method based on cognitive perception and dual-process… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08168v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08168v1-abstract-full" style="display: none;"> While autonomous driving technology has made remarkable strides, data-driven approaches still struggle with complex scenarios due to their limited reasoning capabilities. Meanwhile, knowledge-driven autonomous driving systems have evolved considerably with the popularization of visual language models. In this paper, we propose LeapVAD, a novel method based on cognitive perception and dual-process thinking. Our approach implements a human-attentional mechanism to identify and focus on critical traffic elements that influence driving decisions. By characterizing these objects through comprehensive attributes - including appearance, motion patterns, and associated risks - LeapVAD achieves more effective environmental representation and streamlines the decision-making process. 
Furthermore, LeapVAD incorporates an innovative dual-process decision-making module that mimics the human driving learning process. The system consists of an Analytic Process (System-II) that accumulates driving experience through logical reasoning and a Heuristic Process (System-I) that refines this knowledge via fine-tuning and few-shot learning. LeapVAD also includes reflective mechanisms and a growing memory bank, enabling it to learn from past mistakes and continuously improve its performance in a closed-loop environment. To enhance efficiency, we develop a scene encoder network that generates compact scene representations for rapid retrieval of relevant driving experiences. Extensive evaluations conducted on two leading autonomous driving simulators, CARLA and DriveArena, demonstrate that LeapVAD achieves superior performance compared to camera-only approaches despite limited training data. Comprehensive ablation studies further emphasize its effectiveness in continuous learning and domain adaptation. Project page: https://pjlab-adg.github.io/LeapVAD/.
Submitted 14 January, 2025; originally announced January 2025.
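The memory-bank-plus-scene-encoder idea in the LeapVAD abstract boils down to embedding the current scene and looking up similar past experiences. The toy class below sketches that retrieval step with plain cosine similarity; the ExperienceMemory name and the use of random NumPy vectors in place of a learned scene encoder are assumptions for illustration only.

import numpy as np

class ExperienceMemory:
    """Toy memory bank: store (scene_embedding, experience) pairs, retrieve by cosine similarity."""
    def __init__(self):
        self.embeddings, self.experiences = [], []

    def add(self, embedding: np.ndarray, experience: str) -> None:
        self.embeddings.append(embedding / np.linalg.norm(embedding))
        self.experiences.append(experience)

    def retrieve(self, query: np.ndarray, k: int = 3):
        q = query / np.linalg.norm(query)
        sims = np.stack(self.embeddings) @ q          # cosine similarity to every stored scene
        top = np.argsort(-sims)[:k]
        return [(self.experiences[i], float(sims[i])) for i in top]

mem = ExperienceMemory()
mem.add(np.random.randn(128), "slow down: pedestrian near crosswalk")
mem.add(np.random.randn(128), "yield before unprotected left turn")
print(mem.retrieve(np.random.randn(128), k=1))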
arXiv:2501.06590 [pdf, other] cs.CL cs.AI
ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning
Authors: Xiangru Tang, Tianyu Hu, Muyang Ye, Yanjun Shao, Xunjian Yin, Siru Ouyang, Wangchunshu Zhou, Pan Lu, Zhuosheng Zhang, Yilun Zhao, Arman Cohan, Mark Gerstein
Abstract: Chemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science. Our code can be found at https://github.com/gersteinlab/chemagent
Submitted 11 January, 2025; originally announced January 2025.
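To make the "self-updating library" idea concrete, here is a deliberately small sketch: solved sub-tasks are stored as text and later retrieved for similar queries by string similarity. The ChemLibrary class, the use of difflib for matching, and the example entry are assumptions for illustration, not the ChemAgent implementation (which uses structured memories and LLM-driven refinement).

from difflib import SequenceMatcher

class ChemLibrary:
    """Toy self-updating library: store solved sub-tasks and reuse them for similar new queries."""
    def __init__(self):
        self.entries = []   # list of (sub_task, solution)

    def update(self, sub_task: str, solution: str) -> None:
        # after a task is solved, its sub-tasks and solutions are appended to the library
        self.entries.append((sub_task, solution))

    def retrieve(self, query: str, k: int = 2):
        # rank stored sub-tasks by textual similarity to the new query
        scored = [(SequenceMatcher(None, query, t).ratio(), t, s) for t, s in self.entries]
        return sorted(scored, reverse=True)[:k]

lib = ChemLibrary()
lib.update("ideal gas: solve PV = nRT for n", "n = PV / (RT)")
print(lib.retrieve("find moles from PV = nRT", k=1))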
arXiv:2501.04765 [pdf, other] cs.CV cs.AI
TREAD: Token Routing for Efficient Architecture-agnostic Diffusion Training
Authors: Felix Krause, Timy Phan, Vincent Tao Hu, Björn Ommer
Abstract: Diffusion models have emerged as the mainstream approach for visual generation. However, these models usually suffer from sample inefficiency and high training costs. This issue is particularly pronounced in the standard diffusion transformer architecture due to its quadratic complexity relative to input length. Recent works have addressed this by reducing the number of tokens processed in the model, often through masking. In contrast, this work aims to improve the training efficiency of the diffusion backbone by using predefined routes that store token information and reintroduce it to deeper layers of the model, rather than discarding these tokens entirely. Further, we combine multiple routes and introduce an adapted auxiliary loss that accounts for all applied routes. Our method is not limited to the common transformer-based model; it can also be applied to state-space models. Unlike most current approaches, TREAD achieves this without architectural modifications. Finally, we show that our method reduces the computational cost and simultaneously boosts model performance on the standard benchmark ImageNet-1K 256 x 256 in class-conditional synthesis. Both of these benefits multiply to a convergence speedup of 9.55x at 400K training iterations compared to DiT and 25.39x compared to the best benchmark performance of DiT at 7M training iterations.
Submitted 8 January, 2025; originally announced January 2025.
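The routing idea, where a subset of tokens skips the middle of the network and is merged back before the final layers, can be sketched in a few lines of PyTorch. The module below picks a random token subset as a stand-in for TREAD's predefined routes; the layer sizes, the random selection, and the single merge point are assumptions made to keep the example small.

import torch
import torch.nn as nn

class ToyRoutedBackbone(nn.Module):
    """Route a subset of tokens around the middle blocks and reinsert them later
    (a sketch of the general idea, not the paper's exact routing scheme)."""
    def __init__(self, dim=64, depth=6, keep_ratio=0.5):
        super().__init__()
        self.blocks = nn.ModuleList([nn.TransformerEncoderLayer(dim, 4, batch_first=True)
                                     for _ in range(depth)])
        self.keep_ratio = keep_ratio

    def forward(self, x):                      # x: (batch, tokens, dim)
        b, n, _ = x.shape
        x = self.blocks[0](x)
        keep = torch.rand(b, n, device=x.device).argsort(dim=1)[:, : int(n * self.keep_ratio)]
        stored = x.clone()                     # the "route" remembers all tokens here
        x = torch.gather(x, 1, keep.unsqueeze(-1).expand(-1, -1, x.size(-1)))
        for blk in self.blocks[1:-1]:          # middle blocks only see the kept subset
            x = blk(x)
        stored.scatter_(1, keep.unsqueeze(-1).expand(-1, -1, x.size(-1)), x)
        return self.blocks[-1](stored)         # routed tokens are reintroduced before the last block

tokens = torch.randn(2, 32, 64)
print(ToyRoutedBackbone()(tokens).shape)       # torch.Size([2, 32, 64])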
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04765v1-abstract-full').style.display = 'none'; document.getElementById('2501.04765v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00880">arXiv:2501.00880</a> <span> [<a href="https://arxiv.org/pdf/2501.00880">pdf</a>, <a href="https://arxiv.org/format/2501.00880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving Autoregressive Visual Generation with Cluster-Oriented Token Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+T">Teng Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+R">Ran Yi</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+J">Jieyu Weng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yabiao Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xianfang Zeng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Z">Zhucun Xue</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lizhuang Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00880v1-abstract-short" style="display: inline;"> Employing LLMs for visual generation has recently become a research focus. However, the existing methods primarily transfer the LLM architecture to visual generation but rarely investigate the fundamental differences between language and vision. This oversight may lead to suboptimal utilization of visual generation capabilities within the LLM framework. In this paper, we explore the characteristic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00880v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00880v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00880v1-abstract-full" style="display: none;"> Employing LLMs for visual generation has recently become a research focus. However, the existing methods primarily transfer the LLM architecture to visual generation but rarely investigate the fundamental differences between language and vision. This oversight may lead to suboptimal utilization of visual generation capabilities within the LLM framework. In this paper, we explore the characteristics of visual embedding space under the LLM framework and discover that the correlation between visual embeddings can help achieve more stable and robust generation results. We present IAR, an Improved AutoRegressive Visual Generation Method that enhances the training efficiency and generation quality of LLM-based visual generation models. 
First, we propose a Codebook Rearrangement strategy that uses a balanced k-means clustering algorithm to rearrange the visual codebook into clusters, ensuring high similarity among visual features within each cluster. Leveraging the rearranged codebook, we propose a Cluster-oriented Cross-entropy Loss that guides the model to correctly predict the cluster in which the token is located. This approach ensures that even if the model predicts the wrong token index, there is a high probability that the predicted token lies in the correct cluster, which significantly enhances generation quality and robustness. Extensive experiments demonstrate that our method consistently enhances model training efficiency and performance from 100M to 1.4B parameters, reducing the training time by half while achieving the same FID. Additionally, our approach can be applied to various LLM-based visual generation models and adheres to the scaling law, providing a promising direction for future research in LLM-based visual generation.
Submitted 1 January, 2025; originally announced January 2025.
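The Cluster-oriented Cross-entropy Loss can be illustrated independently of the codebook rearrangement: given a fixed token-to-cluster assignment, aggregate token logits into cluster logits and penalize predicting the wrong cluster. The function below is a sketch under that assumption; the logsumexp aggregation and the Python loop over clusters are choices made for clarity, not the paper's exact formulation.

import torch
import torch.nn.functional as F

def cluster_cross_entropy(token_logits: torch.Tensor,
                          target_tokens: torch.Tensor,
                          token_to_cluster: torch.Tensor,
                          num_clusters: int) -> torch.Tensor:
    """Cross-entropy over clusters: pool token logits into cluster logits via logsumexp,
    then compare against the cluster of the ground-truth token."""
    # token_logits: (batch, vocab); target_tokens: (batch,); token_to_cluster: (vocab,) long
    cluster_logits = torch.full((token_logits.size(0), num_clusters), float("-inf"),
                                device=token_logits.device)
    for c in range(num_clusters):
        mask = token_to_cluster == c
        if mask.any():
            cluster_logits[:, c] = torch.logsumexp(token_logits[:, mask], dim=1)
    target_clusters = token_to_cluster[target_tokens]
    return F.cross_entropy(cluster_logits, target_clusters)

# toy usage: 16 codebook tokens assigned round-robin to 4 clusters
vocab, clusters = 16, 4
logits = torch.randn(8, vocab)
assignment = torch.arange(vocab) % clusters
targets = torch.randint(0, vocab, (8,))
print(cluster_cross_entropy(logits, targets, assignment, clusters))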
arXiv:2412.19037 [pdf, other] cs.CR cs.AI
CL-attack: Textual Backdoor Attacks via Cross-Lingual Triggers
Authors: Jingyi Zheng, Tianyi Hu, Tianshuo Cong, Xinlei He
Abstract: Backdoor attacks significantly compromise the security of large language models by triggering them to output specific and controlled content. Currently, triggers for textual backdoor attacks fall into two categories: fixed-token triggers and sentence-pattern triggers. However, the former are typically easy to identify and filter, while the latter, such as syntax and style, do not apply to all original samples and may lead to semantic shifts. In this paper, inspired by cross-lingual (CL) prompts of LLMs in real-world scenarios, we propose a higher-dimensional trigger method at the paragraph level, namely CL-attack. CL-attack injects the backdoor by using texts with specific structures that incorporate multiple languages, thereby offering greater stealthiness and universality compared to existing backdoor attack techniques. Extensive experiments on different tasks and model architectures demonstrate that CL-attack can achieve nearly 100% attack success rate with a low poisoning rate in both classification and generation tasks. We also empirically show that the CL-attack is more robust against current major defense methods compared to baseline backdoor attacks. Additionally, to mitigate CL-attack, we further develop a new defense called TranslateDefense, which can partially mitigate the impact of CL-attack.
Submitted 25 December, 2024; originally announced December 2024.
Comments: The paper has been accepted to AAAI 2025
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper has been accepted to AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16875">arXiv:2412.16875</a> <span> [<a href="https://arxiv.org/pdf/2412.16875">pdf</a>, <a href="https://arxiv.org/format/2412.16875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Swept Volume-Aware Trajectory Planning and MPC Tracking for Multi-Axle Swerve-Drive AMRs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tianxin Hu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shenghai Yuan</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+R">Ruofei Bai</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinghang Xu</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yuwen Liao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fen Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lihua Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16875v2-abstract-short" style="display: inline;"> Multi-axle autonomous mobile robots (AMRs) are set to revolutionize the future of robotics in logistics. As the backbone of next-generation solutions, these robots face a critical challenge: managing and minimizing the swept volume during turns while maintaining precise control. Traditional systems designed for standard vehicles often struggle with the complex dynamics of multi-axle configurations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16875v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16875v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16875v2-abstract-full" style="display: none;"> Multi-axle autonomous mobile robots (AMRs) are set to revolutionize the future of robotics in logistics. As the backbone of next-generation solutions, these robots face a critical challenge: managing and minimizing the swept volume during turns while maintaining precise control. Traditional systems designed for standard vehicles often struggle with the complex dynamics of multi-axle configurations, leading to inefficiency and increased safety risk in confined spaces. Our innovative framework overcomes these limitations by combining swept volume minimization with Signed Distance Field (SDF) path planning and model predictive control (MPC) for independent wheel steering. This approach not only plans paths with an awareness of the swept volume but actively minimizes it in real-time, allowing each axle to follow a precise trajectory while significantly reducing the space the vehicle occupies. By predicting future states and adjusting the turning radius of each wheel, our method enhances both maneuverability and safety, even in the most constrained environments. Unlike previous works, our solution goes beyond basic path calculation and tracking, offering real-time path optimization with minimal swept volume and efficient individual axle control. 
arXiv:2412.11917 [pdf, other] cs.CV
Does VLM Classification Benefit from LLM Description Semantics?
Authors: Pingchuan Ma, Lennart Rietdorf, Dmytro Kotovenko, Vincent Tao Hu, Björn Ommer
Abstract: Accurately describing images with text is a foundation of explainable AI. Vision-Language Models (VLMs) like CLIP have recently addressed this by aligning images and texts in a shared embedding space, expressing semantic similarities between vision and language embeddings. VLM classification can be improved with descriptions generated by Large Language Models (LLMs). However, it is difficult to determine the contribution of actual description semantics, as the performance gain may also stem from a semantic-agnostic ensembling effect, where multiple modified text prompts act as a noisy test-time augmentation for the original one. We propose an alternative evaluation scenario to decide whether a performance boost of LLM-generated descriptions is caused by such a noise augmentation effect or rather by genuine description semantics. The proposed scenario avoids noisy test-time augmentation and ensures that genuine, distinctive descriptions cause the performance boost. Furthermore, we propose a training-free method for selecting discriminative descriptions that works independently of classname-ensembling effects. Our approach identifies descriptions that effectively differentiate classes within a local CLIP label neighborhood, improving classification accuracy across seven datasets. Additionally, we provide insights into the explainability of description-based image classification with VLMs.
Submitted 19 December, 2024; v1 submitted 16 December, 2024; originally announced December 2024.
Comments: AAAI-25 (extended version), Code: https://github.com/CompVis/DisCLIP
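A minimal way to picture "descriptions that differentiate classes within a local CLIP label neighborhood" is to score each candidate description by its similarity to its own class embedding minus its average similarity to neighboring class embeddings. The snippet below does exactly that with random vectors standing in for CLIP text embeddings; the scoring rule and all names are illustrative assumptions, not the released DisCLIP code.

import numpy as np

def discriminativeness(desc_emb: np.ndarray, class_emb: np.ndarray,
                       neighbor_embs: np.ndarray) -> float:
    """Similarity to the description's own class minus mean similarity to nearby classes.

    All embeddings are assumed L2-normalized text embeddings (e.g., from CLIP);
    random vectors are used below only to keep the sketch self-contained.
    """
    own = float(desc_emb @ class_emb)
    others = float(np.mean(neighbor_embs @ desc_emb))
    return own - others

rng = np.random.default_rng(0)
norm = lambda v: v / np.linalg.norm(v, axis=-1, keepdims=True)
class_emb = norm(rng.normal(size=512))
neighbors = norm(rng.normal(size=(5, 512)))        # local label neighborhood
candidates = norm(rng.normal(size=(10, 512)))      # candidate description embeddings
best = max(range(10), key=lambda i: discriminativeness(candidates[i], class_emb, neighbors))
print("most discriminative candidate:", best)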
arXiv:2412.10430 [pdf, other] cs.CV cs.GR
Unsupervised Cross-Domain Regression for Fine-grained 3D Game Character Reconstruction
Authors: Qi Wen, Xiang Wen, Hao Jiang, Siqi Yang, Bingfeng Han, Tianlei Hu, Gang Chen, Shuang Li
Abstract: With the rise of the "metaverse" and the rapid development of games, it has become more and more critical to reconstruct characters in the virtual world faithfully. The immersive experience is one of the most central themes of the "metaverse", and the fidelity of the avatar is a crucial point. Meanwhile, games are the carrier of the metaverse, in which players can freely edit the facial appearance of their game character. In this paper, we propose a simple but powerful cross-domain framework that can reconstruct fine-grained 3D game characters from single-view images in an end-to-end manner. Different from previous methods, which do not resolve the cross-domain gap, we propose an effective regressor that greatly reduces the discrepancy between the real-world domain and the game domain. To compensate for the lack of ground truth, our unsupervised framework accomplishes knowledge transfer to the target domain. Additionally, an innovative contrastive loss is proposed to address instance-wise disparity, which preserves the person-specific details of the reconstructed character. In addition, an auxiliary 3D identity-aware extractor is employed to further refine the results of our model. A large set of physically meaningful facial parameters is then generated robustly and exquisitely. Experiments demonstrate that our method yields state-of-the-art performance in 3D game character reconstruction.
Submitted 10 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07371">arXiv:2412.07371</a> <span> [<a href="https://arxiv.org/pdf/2412.07371">pdf</a>, <a href="https://arxiv.org/format/2412.07371">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> PRM: Photometric Stereo based Large Reconstruction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+W">Wenhang Ge</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiantao Lin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+G">Guibao Shen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jiawei Feng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tao Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinli Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07371v1-abstract-short" style="display: inline;"> We propose PRM, a novel photometric stereo based large reconstruction model to reconstruct high-quality meshes with fine-grained local details. Unlike previous large reconstruction models that prepare images under fixed and simple lighting as both input and supervision, PRM renders photometric stereo images by varying materials and lighting for the purposes, which not only improves the precise loc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07371v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07371v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07371v1-abstract-full" style="display: none;"> We propose PRM, a novel photometric stereo based large reconstruction model to reconstruct high-quality meshes with fine-grained local details. Unlike previous large reconstruction models that prepare images under fixed and simple lighting as both input and supervision, PRM renders photometric stereo images by varying materials and lighting for the purposes, which not only improves the precise local details by providing rich photometric cues but also increases the model robustness to variations in the appearance of input images. To offer enhanced flexibility of images rendering, we incorporate a real-time physically-based rendering (PBR) method and mesh rasterization for online images rendering. Moreover, in employing an explicit mesh as our 3D representation, PRM ensures the application of differentiable PBR, which supports the utilization of multiple photometric supervisions and better models the specular color for high-quality geometry optimization. Our PRM leverages photometric stereo images to achieve high-quality reconstructions with fine-grained local details, even amidst sophisticated image appearances. 
Extensive experiments demonstrate that PRM significantly outperforms other models.
Submitted 10 December, 2024; originally announced December 2024.
Comments: https://wenhangge.github.io/PRM/

arXiv:2412.06787 [pdf, other] cs.CV cs.AI
[MASK] is All You Need
Authors: Vincent Tao Hu, Björn Ommer
Abstract: In generative models, two paradigms have gained traction in various applications: next-set prediction-based Masked Generative Models and next-noise prediction-based Non-Autoregressive Models, e.g., Diffusion Models. In this work, we propose using discrete-state models to connect them and explore their scalability in the vision domain. First, we conduct a step-by-step analysis in a unified design space across the two types of models, covering timestep independence, noise schedule, temperature, guidance strength, etc., in a scalable manner. Second, we re-cast typical discriminative tasks, e.g., image segmentation, as an unmasking process from [MASK] tokens on a discrete-state model. This enables us to perform various sampling processes, including flexible conditional sampling, by only training once to model the joint distribution.
All aforementioned explorations lead to our framework, named Discrete Interpolants, which enables us to achieve state-of-the-art or competitive performance compared to previous discrete-state based methods on various benchmarks, like ImageNet256, MS COCO, and the video dataset FaceForensics. In summary, by leveraging [MASK] in discrete-state models, we can bridge Masked Generative and Non-autoregressive Diffusion models, as well as generative and discriminative tasks.
Submitted 10 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: Technical Report (WIP), Project Page (code, model, dataset): https://compvis.github.io/mask/
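The unmasking view of generation in the abstract, starting from all-[MASK] tokens and progressively committing the model's most confident predictions, can be shown generically in a few lines. The function below uses a random-logits stand-in for the network and a simple linear unmasking schedule; both are assumptions for the sketch and are not tied to the Discrete Interpolants implementation.

import torch

def iterative_unmasking(model, seq_len=16, vocab=128, mask_id=0, steps=4):
    """Generate a discrete sequence by progressively replacing [MASK] tokens with
    the model's most confident predictions (a generic sketch of mask-based generation)."""
    tokens = torch.full((1, seq_len), mask_id, dtype=torch.long)
    for step in range(steps):
        logits = model(tokens)                                # (1, seq_len, vocab)
        conf, pred = logits.softmax(-1).max(-1)
        still_masked = tokens == mask_id
        k = int(still_masked.sum() * (step + 1) / steps)      # unmask a growing fraction
        conf = conf.masked_fill(~still_masked, -1.0)          # never re-touch committed tokens
        idx = conf.topk(k, dim=1).indices
        tokens[0, idx[0]] = pred[0, idx[0]]
    return tokens

dummy = lambda t: torch.randn(t.size(0), t.size(1), 128)      # stand-in for a trained network
print(iterative_unmasking(dummy))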
arXiv:2412.06674 [pdf, other] cs.CV
EMOv2: Pushing 5M Vision Model Frontier
Authors: Jiangning Zhang, Teng Hu, Haoyang He, Zhucun Xue, Yabiao Wang, Chengjie Wang, Yong Liu, Xiangtai Li, Dacheng Tao
Abstract: This work focuses on developing parameter-efficient and lightweight models for dense predictions while trading off parameters, FLOPs, and performance. Our goal is to set up the new frontier of the 5M-magnitude lightweight model on various downstream tasks. The Inverted Residual Block (IRB) serves as the infrastructure for lightweight CNNs, but no counterpart has been established for attention-based designs. Our work rethinks the lightweight infrastructure of the efficient IRB and practical components in the Transformer from a unified perspective, extending the CNN-based IRB to attention-based models and abstracting a one-residual Meta Mobile Block (MMBlock) for lightweight model design. Following a neat but effective design criterion, we deduce a modern Improved Inverted Residual Mobile Block (i2RMB) and build a hierarchical Efficient MOdel (EMOv2) with no elaborate complex structures. Considering the imperceptible latency for mobile users when downloading models under 4G/5G bandwidth and ensuring model performance, we investigate the performance upper limit of lightweight models with a magnitude of 5M. Extensive experiments on various vision recognition, dense prediction, and image generation tasks demonstrate the superiority of our EMOv2 over state-of-the-art methods; e.g., EMOv2-1M/2M/5M achieve 72.3, 75.8, and 79.4 Top-1 accuracy, surpassing equal-order CNN-/attention-based models significantly. At the same time, EMOv2-5M equipped with RetinaNet achieves 41.5 mAP on object detection, surpassing the previous EMO-5M by +2.6. When employing a more robust training recipe, our EMOv2-5M eventually achieves 82.9 Top-1 accuracy, which elevates the performance of 5M-magnitude models to a new level. Code is available at https://github.com/zhangzjn/EMOv2.
Submitted 9 December, 2024; originally announced December 2024.
arXiv:2412.04037 [pdf, other] cs.CV cs.AI
INFP: Audio-Driven Interactive Head Generation in Dyadic Conversations
Authors: Yongming Zhu, Longhao Zhang, Zhengkun Rong, Tianshu Hu, Shuang Liang, Zhipeng Ge
Abstract: Imagine having a conversation with a socially intelligent agent. It can attentively listen to your words and offer visual and linguistic feedback promptly. This seamless interaction allows for multiple rounds of conversation to flow smoothly and naturally. In pursuit of actualizing it, we propose INFP, a novel audio-driven head generation framework for dyadic interaction. Unlike previous head generation works that only focus on single-sided communication, or require manual role assignment and explicit role switching, our model drives the agent portrait to dynamically alternate between speaking and listening states, guided by the input dyadic audio. Specifically, INFP comprises a Motion-Based Head Imitation stage and an Audio-Guided Motion Generation stage. The first stage learns to project facial communicative behaviors from real-life conversation videos into a low-dimensional motion latent space and uses the motion latent codes to animate a static image. The second stage learns the mapping from the input dyadic audio to motion latent codes through denoising, leading to audio-driven head generation in interactive scenarios. To facilitate this line of research, we introduce DyConv, a large-scale dataset of rich dyadic conversations collected from the Internet. Extensive experiments and visualizations demonstrate the superior performance and effectiveness of our method.
Project Page: https://grisoon.github.io/INFP/.
Submitted 5 December, 2024; originally announced December 2024.

arXiv:2412.03814 [pdf, other] cs.CV
Exploring Real&Synthetic Dataset and Linear Attention in Image Restoration
Authors: Yuzhen Du, Teng Hu, Jiangning Zhang, Ran Yi, Chengming Xu, Xiaobin Hu, Kai Wu, Donghao Luo, Yabiao Wang, Lizhuang Ma
Abstract: Image restoration (IR) aims to recover high-quality images from degraded inputs, with recent deep learning advancements significantly enhancing performance. However, existing methods lack a unified training benchmark for iterations and configurations. We also identify a bias in image complexity distributions between commonly used IR training and testing datasets, resulting in suboptimal restoration outcomes. To address this, we introduce a large-scale IR dataset called ReSyn, which employs a novel image filtering method based on image complexity to ensure a balanced distribution and includes both real and AIGC synthetic images.
We establish a unified training standard that specifies iterations and configurations for image restoration models, focusing on measuring model convergence and restoration capability. Additionally, we enhance transformer-based image restoration models using linear attention mechanisms by proposing RWKV-IR, which integrates linear-complexity RWKV into the transformer structure, allowing for both global and local receptive fields. Instead of directly using Vision-RWKV, we replace the original Q-Shift in RWKV with a depth-wise convolution shift to better model local dependencies, combined with bi-directional attention for comprehensive linear attention. We also introduce a Cross-Bi-WKV module that merges two Bi-WKV modules with different scanning orders for balanced horizontal and vertical attention. Extensive experiments validate the effectiveness of our RWKV-IR model.
Submitted 11 December, 2024; v1 submitted 4 December, 2024; originally announced December 2024.
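The "replace Q-Shift with a depth-wise convolution shift" step amounts to letting each channel mix with its spatial neighbours through a grouped 3x3 convolution. The module below sketches only that operation; the class name, kernel size, and tensor layout are assumptions for illustration and are not taken from the RWKV-IR code.

import torch
import torch.nn as nn

class DepthwiseConvShift(nn.Module):
    """Mix each channel with its spatial neighbours via a depth-wise 3x3 convolution,
    as a stand-in for the token-shift step described in the abstract (illustrative only)."""
    def __init__(self, channels: int):
        super().__init__()
        # groups=channels makes the convolution depth-wise: one filter per channel
        self.dwconv = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (B, C, H, W)
        return self.dwconv(x)

feat = torch.randn(1, 32, 16, 16)
print(DepthwiseConvShift(32)(feat).shape)   # torch.Size([1, 32, 16, 16])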
arXiv:2412.03512 [pdf, other] cs.CV
Distillation of Diffusion Features for Semantic Correspondence
Authors: Frank Fundel, Johannes Schusterbauer, Vincent Tao Hu, Björn Ommer
Abstract: Semantic correspondence, the task of determining relationships between different parts of images, underpins various applications including 3D reconstruction, image-to-image translation, object tracking, and visual place recognition. Recent studies have begun to explore representations learned in large generative image models for semantic correspondence, demonstrating promising results. Building on this progress, current state-of-the-art methods rely on combining multiple large models, resulting in high computational demands and reduced efficiency. In this work, we address this challenge by proposing a more computationally efficient approach. We propose a novel knowledge distillation technique to overcome the problem of reduced efficiency. We show how to use two large vision foundation models and distill the capabilities of these complementary models into one smaller model that maintains high accuracy at reduced computational cost. Furthermore, we demonstrate that by incorporating 3D data, we are able to further improve performance, without the need for human-annotated correspondences. Overall, our empirical results demonstrate that our distilled model with 3D data augmentation achieves performance superior to current state-of-the-art methods while significantly reducing computational load and enhancing practicality for real-world applications, such as semantic video correspondence. Our code and weights are publicly available on our project page.
Submitted 4 December, 2024; originally announced December 2024.
Comments: WACV 2025, Page: https://compvis.github.io/distilldift
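As a rough illustration of distilling two complementary foundation models into one smaller student, the sketch below (PyTorch assumed; the projection heads, feature shapes, and cosine objective are assumptions, not the paper's exact recipe) aligns student features with both teachers.

```python
# Illustrative sketch of distilling two frozen "teacher" feature extractors into
# one smaller student. Projection heads, loss, and shapes are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

def distillation_loss(student_feat, teacher_a_feat, teacher_b_feat, proj_a, proj_b):
    # Project the student into each teacher's feature space and align by cosine similarity.
    loss_a = 1 - F.cosine_similarity(proj_a(student_feat), teacher_a_feat, dim=-1).mean()
    loss_b = 1 - F.cosine_similarity(proj_b(student_feat), teacher_b_feat, dim=-1).mean()
    return loss_a + loss_b

# Example with random tensors standing in for per-patch features:
student = torch.randn(4, 196, 384)          # small student backbone output
teacher_a = torch.randn(4, 196, 768)        # e.g. a generative-model feature
teacher_b = torch.randn(4, 196, 1024)       # e.g. a self-supervised ViT feature
proj_a, proj_b = nn.Linear(384, 768), nn.Linear(384, 1024)
print(distillation_loss(student, teacher_a, teacher_b, proj_a, proj_b))
```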
arXiv:2412.02632 [pdf, other] cs.CV, cs.AI
Scaling Image Tokenizers with Grouped Spherical Quantization
Authors: Jiangtao Wang, Zhen Qin, Yifan Zhang, Vincent Tao Hu, Björn Ommer, Rania Briq, Stefan Kesselheim
Abstract: Vision tokenizers have gained a lot of attraction due to their scalability and compactness; previous works depend on old-school GAN-based hyperparameters, biased comparisons, and a lack of comprehensive analysis of the scaling behaviours. To tackle those issues, we introduce Grouped Spherical Quantization (GSQ), featuring spherical codebook initialization and lookup regularization to constrain codebook latent to a spherical surface. Our empirical analysis of image tokenizer training strategies demonstrates that GSQ-GAN achieves superior reconstruction quality over state-of-the-art methods with fewer training iterations, providing a solid foundation for scaling studies. Building on this, we systematically examine the scaling behaviours of GSQ, specifically in latent dimensionality, codebook size, and compression ratios, and their impact on model performance. Our findings reveal distinct behaviours at high and low spatial compression levels, underscoring challenges in representing high-dimensional latent spaces. We show that GSQ can restructure high-dimensional latent into compact, low-dimensional spaces, thus enabling efficient scaling with improved quality. As a result, GSQ-GAN achieves a 16x down-sampling with a reconstruction FID (rFID) of 0.50.
Submitted 4 December, 2024; v1 submitted 3 December, 2024; originally announced December 2024.
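A toy sketch of the grouped spherical lookup described above (group count, codebook sizes, and shapes are assumptions; this is not the released implementation): channels are split into groups, latents and codes are kept on the unit sphere, and each group picks its nearest code by cosine similarity.

```python
# Rough sketch of grouped spherical quantization: split latent channels into
# groups, keep codes and latents on the unit sphere, nearest-neighbour lookup
# per group. All sizes here are illustrative assumptions.
import torch
import torch.nn.functional as F

def grouped_spherical_quantize(z, codebooks):
    # z: (B, C) latent; codebooks: list of (K, C // G) tensors, one per group
    groups = z.chunk(len(codebooks), dim=-1)
    quantized, indices = [], []
    for g, cb in zip(groups, codebooks):
        g = F.normalize(g, dim=-1)            # latent restricted to the sphere
        cb = F.normalize(cb, dim=-1)          # codebook restricted to the sphere
        idx = (g @ cb.t()).argmax(dim=-1)     # max cosine similarity = nearest on the sphere
        quantized.append(cb[idx])
        indices.append(idx)
    return torch.cat(quantized, dim=-1), torch.stack(indices, dim=-1)

z = torch.randn(8, 16)                                   # 16-dim latent
codebooks = [torch.randn(256, 4) for _ in range(4)]      # 4 groups, 256 codes each
zq, idx = grouped_spherical_quantize(z, codebooks)
```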
arXiv:2411.16061 [pdf, other] cs.CV
DOI: 10.1109/TPAMI.2025.3530246
Scaling Spike-driven Transformer with Efficient Spike Firing Approximation Training
Authors: Man Yao, Xuerui Qiu, Tianxiang Hu, Jiakui Hu, Yuhong Chou, Keyu Tian, Jianxing Liao, Luziwei Leng, Bo Xu, Guoqi Li
Abstract: The ambition of brain-inspired Spiking Neural Networks (SNNs) is to become a low-power alternative to traditional Artificial Neural Networks (ANNs). This work addresses two major challenges in realizing this vision: the performance gap between SNNs and ANNs, and the high training costs of SNNs. We identify intrinsic flaws in spiking neurons caused by binary firing mechanisms and propose a Spike Firing Approximation (SFA) method using integer training and spike-driven inference. This optimizes the spike firing pattern of spiking neurons, enhancing efficient training, reducing power consumption, improving performance, enabling easier scaling, and better utilizing neuromorphic chips. We also develop an efficient spike-driven Transformer architecture and a spike-masked autoencoder to prevent performance degradation during SNN scaling. On ImageNet-1k, we achieve state-of-the-art top-1 accuracy of 78.5%, 79.8%, 84.0%, and 86.2% with models containing 10M, 19M, 83M, and 173M parameters, respectively. For instance, the 10M model outperforms the best existing SNN by 7.2% on ImageNet, with training time acceleration and inference energy efficiency improved by 4.5x and 3.9x, respectively. We validate the effectiveness and efficiency of the proposed method across various tasks, including object detection, semantic segmentation, and neuromorphic vision tasks. This work enables SNNs to match ANN performance while maintaining the low-power advantage, marking a significant step towards SNNs as a general visual backbone. Code is available at https://github.com/BICLab/Spike-Driven-Transformer-V3.
Submitted 24 November, 2024; originally announced November 2024.
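The phrase "integer training and spike-driven inference" can be illustrated with a heavily simplified sketch (PyTorch assumed; the clamp range, straight-through gradient, and unrolling are illustrative assumptions, not the paper's SFA definition): train with integer spike counts, then view each count as that many binary spikes across D timesteps at inference.

```python
# Hedged sketch only: integer counts with a straight-through gradient during
# training, unrolled into a binary spike train at inference.
import torch

def integer_fire(x, D=4):
    """Training-time surrogate: integer spike count with straight-through gradient."""
    clamped = x.clamp(0, D)
    counts = clamped.round()
    return clamped + (counts - clamped).detach()   # forward: integers, backward: identity

def unroll_to_spikes(counts, D=4):
    """Inference-time view: an integer count k becomes k ones among D binary timesteps."""
    steps = torch.arange(D).view(-1, *([1] * counts.dim()))
    return (steps < counts.unsqueeze(0)).float()    # (D, *counts.shape) 0/1 spike train

x = torch.randn(2, 3) * 3
counts = integer_fire(x)
spikes = unroll_to_spikes(counts)
assert torch.allclose(spikes.sum(0), counts)        # the spike train sums back to the count
```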
arXiv:2411.07132 [pdf, other] cs.CV, cs.AI
Token Merging for Training-Free Semantic Binding in Text-to-Image Synthesis
Authors: Taihang Hu, Linxuan Li, Joost van de Weijer, Hongcheng Gao, Fahad Shahbaz Khan, Jian Yang, Ming-Ming Cheng, Kai Wang, Yaxing Wang
Abstract: Although text-to-image (T2I) models exhibit remarkable generation capabilities, they frequently fail to accurately bind semantically related objects or attributes in the input prompts; a challenge termed semantic binding. Previous approaches either involve intensive fine-tuning of the entire T2I model or require users or large language models to specify generation layouts, adding complexity. In this paper, we define semantic binding as the task of associating a given object with its attribute, termed attribute binding, or linking it to other related sub-objects, referred to as object binding. We introduce a novel method called Token Merging (ToMe), which enhances semantic binding by aggregating relevant tokens into a single composite token. This ensures that the object, its attributes and sub-objects all share the same cross-attention map. Additionally, to address potential confusion among main objects with complex textual prompts, we propose end token substitution as a complementary strategy. To further refine our approach in the initial stages of T2I generation, where layouts are determined, we incorporate two auxiliary losses, an entropy loss and a semantic binding loss, to iteratively update the composite token to improve the generation integrity. We conducted extensive experiments to validate the effectiveness of ToMe, comparing it against various existing methods on the T2I-CompBench and our proposed GPT-4o object binding benchmark. Our method is particularly effective in complex scenarios that involve multiple objects and attributes, which previous methods often fail to address. The code will be publicly available at https://github.com/hutaihang/ToMe.
Submitted 11 November, 2024; originally announced November 2024.
Comments: Accepted by NeurIPS 2024
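A minimal sketch of the token-merging step described above (the aggregation rule, token indices, and embedding shapes are assumptions; see the authors' repository for the actual method): the embeddings of an object and its attribute tokens are collapsed into one composite token so they share a single cross-attention map.

```python
# Illustrative sketch only: merge the prompt embeddings of an object and its
# attribute/sub-object tokens into one composite token. Mean aggregation and
# the chosen indices are assumptions, not the paper's procedure.
import torch

def merge_tokens(prompt_embeds, group_indices):
    # prompt_embeds: (L, C) text-encoder output; group_indices: token positions to merge
    merged = prompt_embeds.clone()
    composite = prompt_embeds[group_indices].mean(dim=0)        # one composite token
    merged[group_indices[0]] = composite                         # keep it at the object slot
    keep = [i for i in range(prompt_embeds.shape[0]) if i not in group_indices[1:]]
    return merged[keep]                                          # drop the absorbed tokens

embeds = torch.randn(77, 768)                         # CLIP-style prompt embedding
merged = merge_tokens(embeds, group_indices=[5, 6])   # e.g. an object token and its attribute
```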
arXiv:2410.19317 [pdf, other] cs.CL
FairMT-Bench: Benchmarking Fairness for Multi-turn Dialogue in Conversational LLMs
Authors: Zhiting Fan, Ruizhe Chen, Tianxiang Hu, Zuozhu Liu
Abstract: The growing use of large language model (LLM)-based chatbots has raised concerns about fairness. Fairness issues in LLMs can lead to severe consequences, such as bias amplification, discrimination, and harm to marginalized communities. While existing fairness benchmarks mainly focus on single-turn dialogues, multi-turn scenarios, which in fact better reflect real-world conversations, present greater challenges due to conversational complexity and potential bias accumulation. In this paper, we propose a comprehensive fairness benchmark for LLMs in multi-turn dialogue scenarios, FairMT-Bench. Specifically, we formulate a task taxonomy targeting LLM fairness capabilities across three stages: context understanding, user interaction, and instruction trade-offs, with each stage comprising two tasks. To ensure coverage of diverse bias types and attributes, we draw from existing fairness datasets and employ our template to construct a multi-turn dialogue dataset, FairMT-10K. For evaluation, GPT-4 is applied, alongside bias classifiers including Llama-Guard-3 and human validation to ensure robustness. Experiments and analyses on FairMT-10K reveal that in multi-turn dialogue scenarios, current LLMs are more likely to generate biased responses, and there is significant variation in performance across different tasks and models. Based on this, we curate a challenging dataset, FairMT-1K, and test 15 current state-of-the-art (SOTA) LLMs on this dataset. The results show the current state of fairness in LLMs and showcase the utility of this novel approach for assessing fairness in more realistic multi-turn dialogue contexts, calling for future work to focus on LLM fairness improvement and the adoption of FairMT-1K in such efforts.
Submitted 25 October, 2024; originally announced October 2024.
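The evaluation pipeline this abstract outlines, running a multi-turn dialogue and then scoring the final response with a bias classifier, could look roughly like the sketch below; `chat_model` and `bias_classifier` are placeholders, not APIs from the benchmark.

```python
# Very rough sketch of a multi-turn fairness evaluation loop. Both callables are
# stand-ins (the paper uses GPT-4 and classifiers such as Llama-Guard-3).
def evaluate_dialogue(dialogue_turns, chat_model, bias_classifier):
    history = []
    response = ""
    for user_turn in dialogue_turns:
        history.append({"role": "user", "content": user_turn})
        response = chat_model(history)                   # model under test
        history.append({"role": "assistant", "content": response})
    return bias_classifier(response)                     # e.g. 1.0 if biased, 0.0 otherwise

# biased_rate = mean(evaluate_dialogue(d, chat_model, bias_classifier) for d in dataset)
```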
arXiv:2410.16663 [pdf, other] cs.LG
FastAttention: Extend FlashAttention2 to NPUs and Low-resource GPUs
Authors: Haoran Lin, Xianzhi Yu, Kang Zhao, Lu Hou, Zongyuan Zhan, Stanislav Kamenev, Han Bao, Ting Hu, Mingkai Wang, Qixin Chang, Siyue Sui, Weihao Sun, Jiaxin Hu, Jun Yao, Zekun Yin, Cheng Qian, Ying Zhang, Yinfei Pan, Yu Yang, Weiguo Liu
Abstract: The FlashAttention series has been widely applied in the inference of large language models (LLMs). However, the FlashAttention series only supports high-end GPU architectures, e.g., Ampere and Hopper, and is not easily transferrable to NPUs and low-resource GPUs. Moreover, it is inefficient for multi-NPU or multi-GPU inference scenarios. In this work, we propose FastAttention, which pioneers the adaptation of the FlashAttention series to NPUs and low-resource GPUs to boost LLM inference efficiency. Specifically, we take Ascend NPUs and Volta-based GPUs as representatives for designing our FastAttention. We migrate the FlashAttention series to Ascend NPUs by proposing a novel two-level tiling strategy for runtime speedup, a tiling-mask strategy for memory saving, and a tiling-AllReduce strategy for reducing communication overhead. Besides, we adapt FlashAttention for Volta-based GPUs by redesigning the operand layout in shared memory and introducing a simple yet effective CPU-GPU cooperative strategy for efficient memory utilization. On Ascend NPUs, our FastAttention achieves a 10.7x speedup compared to the standard attention implementation. Llama-7B within FastAttention reaches up to 5.16x higher throughput than with standard attention. On Volta-architecture GPUs, FastAttention yields a 1.43x speedup compared to its equivalents in xformers. Pangu-38B within FastAttention brings a 1.46x end-to-end speedup using FasterTransformer. Coupled with the proposed CPU-GPU cooperative strategy, FastAttention supports a maximal input length of 256K on 8 V100 GPUs. All the code will be made available soon.
Submitted 21 October, 2024; originally announced October 2024.
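FastAttention extends the FlashAttention family, whose core is tiled attention with an online softmax; the NumPy sketch below shows only that core idea (block size and layout are illustrative, and none of the NPU- or Volta-specific strategies from the abstract are modeled).

```python
# Tiled attention with an online softmax: K/V are processed in blocks so the
# full N x N score matrix never materialises. Purely illustrative.
import numpy as np

def tiled_attention(q, k, v, block=128):
    n, d = q.shape
    out = np.zeros_like(v)
    row_max = np.full(n, -np.inf)
    row_sum = np.zeros(n)
    for start in range(0, k.shape[0], block):
        kb, vb = k[start:start + block], v[start:start + block]
        scores = q @ kb.T / np.sqrt(d)                  # (n, block) partial scores
        new_max = np.maximum(row_max, scores.max(axis=1))
        scale = np.exp(row_max - new_max)               # rescale previous accumulators
        p = np.exp(scores - new_max[:, None])
        out = out * scale[:, None] + p @ vb
        row_sum = row_sum * scale + p.sum(axis=1)
        row_max = new_max
    return out / row_sum[:, None]

q, k, v = (np.random.randn(256, 64) for _ in range(3))
s = q @ k.T / np.sqrt(64)                               # exact softmax attention for comparison
ref = np.exp(s - s.max(1, keepdims=True)); ref /= ref.sum(1, keepdims=True)
assert np.allclose(tiled_attention(q, k, v), ref @ v)
```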
arXiv:2410.16418 [pdf, other] cs.CV
AttentionPainter: An Efficient and Adaptive Stroke Predictor for Scene Painting
Authors: Yizhe Tang, Yue Wang, Teng Hu, Ran Yi, Xin Tan, Lizhuang Ma, Yu-Kun Lai, Paul L. Rosin
Abstract: Stroke-based Rendering (SBR) aims to decompose an input image into a sequence of parameterized strokes, which can be rendered into a painting that resembles the input image. Recently, Neural Painting methods that utilize deep learning and reinforcement learning models to predict the stroke sequences have been developed, but suffer from longer inference time or unstable training. To address these issues, we propose AttentionPainter, an efficient and adaptive model for single-step neural painting. First, we propose a novel scalable stroke predictor, which predicts a large number of stroke parameters within a single forward process, instead of the iterative prediction of previous Reinforcement Learning or auto-regressive methods, which makes AttentionPainter faster than previous neural painting methods. To further increase the training efficiency, we propose a Fast Stroke Stacking algorithm, which brings 13 times acceleration for training. Moreover, we propose Stroke-density Loss, which encourages the model to use small strokes for detailed information, to help improve the reconstruction quality. Finally, we propose a new stroke diffusion model for both conditional and unconditional stroke-based generation, which denoises in the stroke parameter space and facilitates stroke-based inpainting and editing applications that are helpful for human artists' design. Extensive experiments show that AttentionPainter outperforms the state-of-the-art neural painting methods.
Submitted 25 October, 2024; v1 submitted 21 October, 2024; originally announced October 2024.
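A minimal sketch of a single-forward-pass stroke predictor in the spirit of the abstract (PyTorch assumed; the backbone, the 8-value stroke parameterization, and the omitted differentiable renderer are all simplifying assumptions):

```python
# One network call yields all N stroke parameter vectors at once, instead of an
# RL or auto-regressive loop. Everything here is a placeholder architecture.
import torch
import torch.nn as nn

class StrokePredictor(nn.Module):
    def __init__(self, n_strokes=64, stroke_dim=8, feat_dim=256):
        super().__init__()
        self.encoder = nn.Sequential(nn.Conv2d(3, feat_dim, 4, 4), nn.ReLU(),
                                     nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.head = nn.Linear(feat_dim, n_strokes * stroke_dim)
        self.n_strokes, self.stroke_dim = n_strokes, stroke_dim

    def forward(self, img):
        feat = self.encoder(img)                                  # (B, feat_dim)
        strokes = self.head(feat).view(-1, self.n_strokes, self.stroke_dim)
        return strokes.sigmoid()                                  # parameters in [0, 1]

strokes = StrokePredictor()(torch.randn(2, 3, 128, 128))          # (2, 64, 8) in one pass
# A differentiable renderer (not shown) would turn `strokes` into a canvas for a
# reconstruction loss against the input image.
```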
arXiv:2410.16257 [pdf, other] cs.CV
Elucidating the design space of language models for image generation
Authors: Xuantong Liu, Shaozhe Hao, Xianbiao Qi, Tianyang Hu, Jun Wang, Rong Xiao, Yuan Yao
Abstract: The success of autoregressive (AR) language models in text generation has inspired the computer vision community to adopt Large Language Models (LLMs) for image generation. However, considering the essential differences between text and image modalities, the design space of language models for image generation remains underexplored. We observe that image tokens exhibit greater randomness compared to text tokens, which presents challenges when training with token prediction. Nevertheless, AR models demonstrate their potential by effectively learning patterns even from a seemingly suboptimal optimization problem. Our analysis also reveals that while all models successfully grasp the importance of local information in image generation, smaller models struggle to capture the global context. In contrast, larger models showcase improved capabilities in this area, helping to explain the performance gains achieved when scaling up model size. We further elucidate the design space of language models for vision generation, including tokenizer choice, model choice, model scalability, vocabulary design, and sampling strategy through extensive comparative experiments. Our work is the first to analyze the optimization behavior of language models in vision generation, and we believe it can inspire more effective designs when applying LMs to other domains. Finally, our elucidated language model for image generation, termed ELM, achieves state-of-the-art performance on the ImageNet 256x256 benchmark. The code is available at https://github.com/Pepperlll/LMforImageGeneration.git.
Submitted 21 October, 2024; originally announced October 2024.
Comments: Project page: https://pepper-lll.github.io/LMforImageGeneration/
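The autoregressive recipe the paper analyses, next-token prediction over discrete image tokens, can be written generically as below (the random token batch, the GRU stand-in for a causal sequence model, and the dimensions are placeholders, not the paper's ELM configuration):

```python
# Generic sketch: an image becomes a sequence of discrete tokenizer indices, and
# a causal model is trained with plain next-token prediction over that sequence.
import torch
import torch.nn.functional as F

vocab, seq_len = 1024, 256                      # e.g. a 16x16 grid of codebook indices
tokens = torch.randint(0, vocab, (8, seq_len))  # stand-in for tokenizer output

embed = torch.nn.Embedding(vocab, 512)
lm = torch.nn.GRU(512, 512, batch_first=True)   # placeholder causal sequence model
head = torch.nn.Linear(512, vocab)

hidden, _ = lm(embed(tokens[:, :-1]))           # predict token t from tokens < t
logits = head(hidden)
loss = F.cross_entropy(logits.reshape(-1, vocab), tokens[:, 1:].reshape(-1))
```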
arXiv:2410.15332 [pdf, other] cs.LG, cs.CL, cs.DC, cs.PF
EPIC: Efficient Position-Independent Context Caching for Serving Large Language Models
Authors: Junhao Hu, Wenrui Huang, Haoyi Wang, Weidong Wang, Tiancheng Hu, Qin Zhang, Hao Feng, Xusheng Chen, Yizhou Shan, Tao Xie
Abstract: Large Language Models (LLMs) are critical for a wide range of applications, but serving them efficiently becomes increasingly challenging as inputs become more complex. Context caching improves serving performance by exploiting inter-request dependency and reusing key-value (KV) cache across requests, thus improving time-to-first-token (TTFT). However, existing prefix-based context caching requires exact token prefix matches, limiting cache reuse in few-shot learning, multi-document QA, or retrieval-augmented generation, where prefixes may vary. In this paper, we present EPIC, an LLM serving system that introduces position-independent context caching (PIC), enabling modular KV cache reuse regardless of token chunk position (or prefix). EPIC features two key designs: AttnLink, which leverages static attention sparsity to minimize recomputation for accuracy recovery, and KVSplit, a customizable chunking method that preserves semantic coherence. Our experiments demonstrate that EPIC delivers up to 8x improvements in TTFT and 7x throughput over existing systems, with negligible or no accuracy loss. By addressing the limitations of traditional caching approaches, EPIC enables more scalable and efficient LLM inference.
Submitted 16 February, 2025; v1 submitted 20 October, 2024; originally announced October 2024.
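Position-independent context caching can be illustrated at a very high level as a cache keyed by chunk content instead of prefix position (the hashing scheme, cache structure, and the omitted boundary recomputation that AttnLink performs are all simplifications, not the system's design):

```python
# Conceptual sketch: KV blocks are keyed by chunk content, so a chunk reused at
# a different position can still hit the cache. Placeholders throughout.
import hashlib

kv_cache = {}   # chunk hash -> precomputed KV block (placeholder objects here)

def chunk_key(chunk_text: str) -> str:
    return hashlib.sha256(chunk_text.encode()).hexdigest()

def get_kv_blocks(chunks, compute_kv):
    blocks = []
    for chunk in chunks:
        key = chunk_key(chunk)
        if key not in kv_cache:                  # miss: compute once, reuse everywhere
            kv_cache[key] = compute_kv(chunk)
        blocks.append(kv_cache[key])
    return blocks                                 # boundary tokens would still be re-attended

docs = ["doc A ...", "doc B ..."]
blocks = get_kv_blocks(docs, compute_kv=lambda c: ("kv for", c))
```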
arXiv:2410.14407 [pdf, other] cs.RO
Formation Control for Enclosing and Tracking via Relative Localization
Authors: Xueming Liu, Dengyu Zhang, Qingrui Zhang, Tianjiang Hu
Abstract: This paper proposes an integrated framework for coordinating multiple unmanned aerial vehicles (UAVs) in a distributed manner to persistently enclose and track a moving target without relying on external localization systems. The proposed framework consists of three modules: cooperative state estimators, circular formation pattern generators, and formation tracking controllers. In the cooperative state estimation module, a recursive least squares estimator (RLSE) for estimating the relative positions between UAVs is integrated with a distributed Kalman filter (DKF), enabling a persistent estimation of the target's state. When a UAV loses direct measurements of the target due to environmental occlusion, measurements from neighbors are aligned into the UAV's local frame to provide indirect measurements. The second module focuses on planning a desired circular formation pattern using a coupled oscillator model. This pattern ensures an even distribution of UAVs around a circle that encloses the moving target. The persistent excitation property of the circular formation is crucial for achieving convergence in the first module. Finally, a consensus-based formation controller is designed to enable multiple UAVs to asymptotically track the planned circular formation pattern while ensuring bounded control inputs. Theoretical analysis demonstrates that the proposed framework ensures asymptotic tracking of a target with constant velocity. For a target with varying velocity, the tracking error converges to a bounded region related to the target's maximum acceleration. Simulations and experiments validate the effectiveness of the proposed algorithm.
Submitted 6 February, 2025; v1 submitted 18 October, 2024; originally announced October 2024.
Comments: 12 pages
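The coupled-oscillator formation generator described in the second module can be pictured with a toy phase-spacing update (the gains, update law, and integration scheme are illustrative, not the paper's oscillator model): phases spread evenly on a circle of radius R around the current target estimate, giving waypoints for the tracking controllers.

```python
# Toy sketch: a Laplacian-style update on the phase gaps drives the agents toward
# an even angular distribution on a ring around the target estimate.
import numpy as np

def formation_step(phases, target, radius=2.0, omega=0.3, gain=0.5, dt=0.05):
    fwd = np.roll(phases, -1) - phases          # gap to the next agent
    bwd = phases - np.roll(phases, 1)           # gap to the previous agent
    wrap = lambda a: np.angle(np.exp(1j * a))   # map angle differences into (-pi, pi]
    dphase = omega + gain * (wrap(fwd) - wrap(bwd))
    phases = phases + dt * dphase
    desired = target + radius * np.stack([np.cos(phases), np.sin(phases)], axis=1)
    return phases, desired                      # desired (N, 2) waypoints for the controllers

phases = np.random.uniform(0, 2 * np.pi, 4)
for _ in range(500):
    phases, waypoints = formation_step(phases, target=np.array([1.0, -2.0]))
```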
arXiv:2410.12777 [pdf, other] cs.CV, cs.CL, cs.CR, cs.LG
Meta-Unlearning on Diffusion Models: Preventing Relearning Unlearned Concepts
Authors: Hongcheng Gao, Tianyu Pang, Chao Du, Taihang Hu, Zhijie Deng, Min Lin
Abstract: With the rapid progress of diffusion-based content generation, significant efforts are being made to unlearn harmful or copyrighted concepts from pretrained diffusion models (DMs) to prevent potential model misuse. However, it is observed that even when DMs are properly unlearned before release, malicious finetuning can compromise this process, causing DMs to relearn the unlearned concepts. This occurs partly because certain benign concepts (e.g., "skin") retained in DMs are related to the unlearned ones (e.g., "nudity"), facilitating their relearning via finetuning. To address this, we propose meta-unlearning on DMs. Intuitively, a meta-unlearned DM should behave like an unlearned DM when used as is; moreover, if the meta-unlearned DM undergoes malicious finetuning on unlearned concepts, the related benign concepts retained within it will be triggered to self-destruct, hindering the relearning of unlearned concepts. Our meta-unlearning framework is compatible with most existing unlearning methods, requiring only the addition of an easy-to-implement meta objective. We validate our approach through empirical experiments on meta-unlearning concepts from Stable Diffusion models (SD-v1-4 and SDXL), supported by extensive ablation studies. Our code is available at https://github.com/sail-sg/Meta-Unlearning.
Submitted 16 October, 2024; originally announced October 2024.
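The bi-level structure behind such a meta objective can be sketched as follows (PyTorch assumed; the tiny linear model, placeholder relearning loss, and single inner step are assumptions that only show the simulate-a-finetune-then-penalize pattern, not the paper's objective):

```python
# Schematic only: simulate one "malicious" finetuning step on the unlearned
# concept, then penalise how much that step would restore it.
import torch
import torch.nn as nn
from torch.func import functional_call

model = nn.Linear(4, 1)                       # stand-in for the diffusion model
x_unlearn = torch.randn(8, 4)                 # stand-in batch for the unlearned concept

def relearn_loss(params):
    # How well the (possibly finetuned) weights reproduce the unlearned concept.
    pred = functional_call(model, params, (x_unlearn,))
    return (pred ** 2).mean()                 # placeholder objective

params = dict(model.named_parameters())
inner_lr = 1e-2
grads = torch.autograd.grad(relearn_loss(params), list(params.values()), create_graph=True)
adapted = {k: p - inner_lr * g for (k, p), g in zip(params.items(), grads)}
# Meta term: after the simulated finetune step, relearning should still fail, so
# the relearning loss at the adapted weights is maximised (hence the minus sign).
meta_loss = -relearn_loss(adapted)
```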
arXiv:2410.06101 [pdf, other] cs.AI, cs.MA
Coevolving with the Other You: Fine-Tuning LLM with Sequential Cooperative Multi-Agent Reinforcement Learning
Authors: Hao Ma, Tianyi Hu, Zhiqiang Pu, Boyin Liu, Xiaolin Ai, Yanyan Liang, Min Chen
Abstract: Reinforcement learning (RL) has emerged as a pivotal technique for fine-tuning large language models (LLMs) on specific tasks. However, prevailing RL fine-tuning methods predominantly rely on PPO and its variants. Though these algorithms are effective in general RL settings, they often exhibit suboptimal performance and vulnerability to distribution collapse when applied to the fine-tuning of LLMs. In this paper, we propose CORY, extending the RL fine-tuning of LLMs to a sequential cooperative multi-agent reinforcement learning framework, to leverage the inherent coevolution and emergent capabilities of multi-agent systems. In CORY, the LLM to be fine-tuned is initially duplicated into two autonomous agents: a pioneer and an observer. The pioneer generates responses based on queries, while the observer generates responses using both the queries and the pioneer's responses. The two agents are trained together. During training, the agents exchange roles periodically, fostering cooperation and coevolution between them. Experiments evaluate CORY's performance by fine-tuning GPT-2 and Llama-2 under subjective and objective reward functions on the IMDB Review and GSM8K datasets, respectively. Results show that CORY outperforms PPO in terms of policy optimality, resistance to distribution collapse, and training robustness, thereby underscoring its potential as a superior methodology for refining LLMs in real-world applications.
Submitted 8 October, 2024; originally announced October 2024.
Comments: 28 pages, 26 images
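The pioneer/observer duplication and periodic role exchange can be sketched schematically (the `EchoAgent` stand-in, the prompt format, and the omitted reward/update steps are placeholders, not a specific RL framework):

```python
# Skeleton of the two-agent rollout: the observer conditions on the pioneer's
# response, and the two agents swap roles periodically during fine-tuning.
def cory_rollout(query, pioneer, observer):
    pioneer_resp = pioneer.generate(query)
    observer_resp = observer.generate(query + "\n" + pioneer_resp)
    return pioneer_resp, observer_resp

class EchoAgent:                       # stand-in for an LLM policy
    def __init__(self, name): self.name = name
    def generate(self, prompt): return f"[{self.name}] answer to: {prompt[:30]}..."

agent_a, agent_b = EchoAgent("A"), EchoAgent("B")
for step in range(4):
    pioneer, observer = (agent_a, agent_b) if step % 2 == 0 else (agent_b, agent_a)
    responses = cory_rollout("What is 2 + 2?", pioneer, observer)
    # ...compute task rewards for both responses and apply the RL update here...
```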
arXiv:2410.02631 [pdf, other] cs.CL
Large Language Model for Multi-Domain Translation: Benchmarking and Domain CoT Fine-tuning
Authors: Tianxiang Hu, Pei Zhang, Baosong Yang, Jun Xie, Derek F. Wong, Rui Wang
Abstract: Achieving consistent high-quality machine translation (MT) across diverse domains remains a significant challenge, primarily due to the limited and imbalanced parallel training data available in various domains. While large language models (LLMs) have demonstrated impressive general understanding and generation abilities, their potential in multi-domain MT is under-explored. We establish a comprehensive benchmark for multi-domain translation, featuring 25 German⇔English and 22 Chinese⇔English test sets respectively covering 15 domains. Our evaluation of prominent LLMs reveals a discernible performance gap against traditional MT systems, highlighting domain overfitting and catastrophic forgetting issues after fine-tuning on domain-limited corpora. To mitigate this, we propose a domain Chain of Thought (CoT) fine-tuning technique that utilizes the intrinsic multi-domain intelligence of LLMs to improve translation performance. This method inspires the LLM to perceive domain information from the source text, which then serves as a helpful hint to guide the translation process. Despite being trained on a small dataset of four domains, our CoT fine-tuning approach achieves notable enhancements in translation accuracy and domain robustness over traditional fine-tuning, as evidenced by an average 1.53 BLEU score increase across more than 20 distinct German→English out-of-domain tests.
Submitted 3 October, 2024; originally announced October 2024.
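The domain CoT idea, first name the domain and then translate with that hint, can be shown with a simple prompt template (the exact wording is an assumption, not the paper's template):

```python
# Illustrative prompt construction for domain Chain-of-Thought translation.
def domain_cot_prompt(src_sentence, src_lang="German", tgt_lang="English"):
    return (
        f"Source ({src_lang}): {src_sentence}\n"
        f"Step 1: Identify the domain of the source sentence (e.g. medical, legal, IT).\n"
        f"Step 2: Using that domain as context, translate the sentence into {tgt_lang}.\n"
        f"Domain:"
    )

print(domain_cot_prompt("Die Dosierung beträgt zweimal täglich 5 mg."))
```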
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01696">arXiv:2410.01696</a> <span> [<a href="https://arxiv.org/pdf/2410.01696">pdf</a>, <a href="https://arxiv.org/format/2410.01696">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CreDes: Causal Reasoning Enhancement and Dual-End Searching for Solving Long-Range Reasoning Problems using LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kangsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Songde Han</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Huimin Ma</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tianyu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01696v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated limitations in handling combinatorial optimization problems involving long-range reasoning, partially due to causal hallucinations and huge search space. As for causal hallucinations, i.e., the inconsistency between reasoning and corresponding state transition, this paper introduces the Causal Relationship Enhancement (CRE) mechanism combining cause-e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01696v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01696v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01696v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated limitations in handling combinatorial optimization problems involving long-range reasoning, partially due to causal hallucinations and huge search space. As for causal hallucinations, i.e., the inconsistency between reasoning and corresponding state transition, this paper introduces the Causal Relationship Enhancement (CRE) mechanism combining cause-effect interventions and the Individual Treatment Effect (ITE) to guarantee the solid causal rightness between each step of reasoning and state transition. As for the long causal range and huge search space limiting the performances of existing models featuring single-direction search, a Dual-End Searching (DES) approach is proposed to seek solutions by simultaneously starting from both the initial and goal states on the causal probability tree. By integrating CRE and DES (CreDes), our model has realized simultaneous multi-step reasoning, circumventing the inefficiencies from cascading multiple one-step reasoning like the Chain-of-Thought (CoT). Experiments demonstrate that CreDes significantly outperforms existing State-Of-The-Art (SOTA) solutions in long-range reasoning tasks in terms of both accuracy and time efficiency. 
arXiv:2409.20548 (https://arxiv.org/abs/2409.20548) [pdf, other]
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction)
Robi Butler: Remote Multimodal Interactions with Household Robot Assistant
Authors: Anxing Xiao, Nuwan Janaka, Tianrun Hu, Anshul Gupta, Kaixin Li, Cunjun Yu, David Hsu
Abstract: In this paper, we introduce Robi Butler, a novel household robotic system that enables multimodal interactions with remote users. Building on advanced communication interfaces, Robi Butler allows users to monitor the robot's status, send text or voice instructions, and select target objects by hand pointing. At the core of our system is a high-level behavior module, powered by Large Language Models (LLMs), that interprets multimodal instructions to generate action plans. These plans are composed of a set of open-vocabulary primitives supported by Vision Language Models (VLMs) that handle both text and pointing queries. The integration of these components allows Robi Butler to ground remote multimodal instructions in a real-world home environment in a zero-shot manner.
We demonstrate the effectiveness and efficiency of this system on a variety of daily household tasks that involve remote users giving multimodal instructions. Additionally, we conducted a user study to analyze how multimodal interactions affect efficiency and user experience during remote human-robot interaction, and we discuss potential improvements.
Submitted 30 September, 2024; originally announced September 2024.

arXiv:2409.18558 (https://arxiv.org/abs/2409.18558) [pdf, other]
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
XWSB: A Blend System Utilizing XLS-R and WavLM with SLS Classifier detection system for SVDD 2024 Challenge
Authors: Qishan Zhang, Shuangbing Wen, Fangke Yan, Tao Hu, Jun Li
Abstract: This paper introduces the model structure used in the SVDD 2024 Challenge, which has been held this year for the first time. Singing voice deepfake detection (SVDD) faces complexities due to informal speech intonations and varying speech rates. In this paper, we propose the XWSB system, which achieved SOTA performance in the SVDD challenge. XWSB stands for XLS-R, WavLM, and SLS Blend, representing the integration of these technologies for SVDD. Specifically, we used the best-performing model structure XLS-R&SLS from the ASVspoof DF dataset, and applied SLS to WavLM to form the WavLM&SLS structure.
Finally, we integrated the two models to form the XWSB system. Experimental results show that our system demonstrates advanced recognition capabilities in the SVDD challenge, achieving an EER of 2.32% on the CtrSVDD track. The code and data can be found at https://github.com/QiShanZhang/XWSB_for_SVDD2024.
Submitted 27 September, 2024; originally announced September 2024.
Journal ref: IEEE Spoken Language Technology Workshop 2024
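For readers unfamiliar with the reported metric, the equal error rate (EER) is the operating point where the false-acceptance and false-rejection rates coincide. Below is a small, generic numpy sketch of how it can be estimated from detection scores; the score convention (higher means more likely bona fide) is an assumption, not taken from the paper.

```python
import numpy as np

def estimate_eer(scores, labels):
    """Approximate the equal error rate from detection scores.
    labels: 1 = bona fide, 0 = spoof; higher score = more likely bona fide (assumed)."""
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels, dtype=int)
    pos, neg = scores[labels == 1], scores[labels == 0]
    best_gap, eer = np.inf, None
    for t in np.unique(scores):
        frr = np.mean(pos < t)    # bona fide wrongly rejected
        far = np.mean(neg >= t)   # spoof wrongly accepted
        gap = abs(far - frr)
        if gap < best_gap:
            best_gap, eer = gap, (far + frr) / 2.0
    return eer

# Toy usage with made-up scores:
print(estimate_eer([0.9, 0.8, 0.4, 0.3, 0.2], [1, 1, 0, 1, 0]))
```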
arXiv:2409.17174 (https://arxiv.org/abs/2409.17174) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
CSCE: Boosting LLM Reasoning by Simultaneous Enhancing of Casual Significance and Consistency
Authors: Kangsheng Wang, Xiao Zhang, Zizheng Guo, Tianyu Hu, Huimin Ma
Abstract: Chain-based reasoning methods like chain of thought (CoT) play a rising role in solving reasoning tasks for large language models (LLMs). However, the causal illusions between a step of reasoning and the corresponding state transition are becoming a significant obstacle to advancing LLMs' reasoning capabilities, especially in long-range reasoning tasks. This paper proposes a non-chain-based reasoning framework that simultaneously considers causal significance and consistency: the Causal Significance and Consistency Enhancer (CSCE). We customize the LLM's loss function using treatment-effect assessments to enhance its reasoning ability from two aspects: causal significance and consistency. This ensures that the model captures essential causal relationships and maintains robust and consistent performance across various scenarios. Additionally, we transform the reasoning process from the cascading multiple one-step reasoning common to chain-based methods such as CoT into a causal-enhanced method that outputs the entire reasoning process in one go, further improving the model's reasoning efficiency. Extensive experiments show that our method improves both the reasoning success rate and speed. These improvements further demonstrate that non-chain-based methods can also help LLMs complete reasoning tasks.
Submitted 20 September, 2024; originally announced September 2024.
arXiv:2409.16626 (https://arxiv.org/abs/2409.16626) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.AR (Hardware Architecture)
Ascend HiFloat8 Format for Deep Learning
Authors: Yuanyong Luo, Zhongxing Zhang, Richard Wu, Hu Liu, Ying Jin, Kai Zheng, Minmin Wang, Zhanying He, Guipeng Hu, Luyao Chen, Tianchi Hu, Junsong Wang, Minqi Chen, Mikhaylov Dmitry, Korviakov Vladimir, Bobrin Maxim, Yuhao Hu, Guanfu Chen, Zeyi Huang
Abstract: This preliminary white paper proposes HiFloat8 (abbreviated HiF8), a novel 8-bit floating-point data format for deep learning. HiF8 features tapered precision. For normal-value encoding, it provides 7 exponent values with a 3-bit mantissa, 8 exponent values with a 2-bit mantissa, and 16 exponent values with a 1-bit mantissa. For denormal-value encoding, it extends the dynamic range by 7 extra powers of 2, from 31 to 38 binades (for comparison, FP16 covers 40 binades). Meanwhile, HiF8 encodes all the special values, except that positive zero and negative zero are represented by a single bit pattern. Thanks to the better balance between precision and dynamic range, HiF8 can be used simultaneously in both the forward and backward passes of AI training. In this paper, we describe the definition and rounding methods of HiF8, as well as tentative training and inference solutions. To demonstrate the efficacy of HiF8, extensive simulation results on various neural networks, including traditional neural networks and large language models (LLMs), are also presented.
Submitted 26 September, 2024; v1 submitted 25 September, 2024; originally announced September 2024.
Comments: 13 pages, 4 figures, 9 tables
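As a rough intuition for "tapered precision" (fewer mantissa bits as the exponent moves away from zero), the following numpy sketch fake-quantizes values with a magnitude-dependent mantissa width. The bucket boundaries and bit counts below are illustrative assumptions only and are not the HiF8 bit layout defined in the white paper.

```python
import numpy as np

def taper_bits(exp: int) -> int:
    """Illustrative taper: more mantissa bits near exponent 0, fewer far away.
    (Assumed bucket boundaries; not the HiF8 specification.)"""
    a = abs(exp)
    if a <= 3:
        return 3
    if a <= 7:
        return 2
    return 1

def fake_quantize(x: np.ndarray) -> np.ndarray:
    """Round each value's mantissa to a width chosen from its exponent."""
    m, e = np.frexp(x)                      # x = m * 2**e with |m| in [0.5, 1)
    out = np.empty_like(x, dtype=float)
    for i, (mi, ei) in enumerate(zip(m.flat, e.flat)):
        scale = 2.0 ** taper_bits(int(ei))
        out.flat[i] = np.ldexp(np.round(mi * scale) / scale, int(ei))
    return out

print(fake_quantize(np.array([0.1234, 1.7, 300.0, 1e-4])))
```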
arXiv:2409.13244 (https://arxiv.org/abs/2409.13244) [pdf, other]
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence)
From Cognition to Precognition: A Future-Aware Framework for Social Navigation
Authors: Zeying Gong, Tianshuai Hu, Ronghe Qiu, Junwei Liang
Abstract: To navigate safely and efficiently in crowded spaces, robots should not only perceive the current state of the environment but also anticipate future human movements. In this paper, we propose a reinforcement learning architecture, namely Falcon, to tackle socially aware navigation by explicitly predicting human trajectories and penalizing actions that block future human paths. To facilitate realistic evaluation, we introduce a novel SocialNav benchmark containing two new datasets, Social-HM3D and Social-MP3D. This benchmark offers large-scale, photo-realistic indoor scenes populated with a reasonable number of human agents based on scene area, incorporating natural human movements and trajectory patterns. We conduct a detailed experimental analysis with the state-of-the-art learning-based method and two classic rule-based path-planning algorithms on the new benchmark. The results demonstrate the importance of future prediction, and our method achieves the best task success rate of 55% while maintaining about 90% personal-space compliance. We will release our code and datasets. Videos of demonstrations can be viewed at https://zeying-gong.github.io/projects/falcon/.
Submitted 8 February, 2025; v1 submitted 20 September, 2024; originally announced September 2024.
Comments: Social navigation; trajectory prediction; auxiliary tasks. This paper has been accepted at the IEEE International Conference on Robotics and Automation (ICRA) 2025. For more details, please refer to the project website: https://zeying-gong.github.io/projects/falcon/
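The abstract describes penalizing actions that block predicted future human paths; the sketch below shows one simple way such a shaping term could be written. The penalty weights, the blocking radius, and the overall reward decomposition are assumptions, not the paper's formulation.

```python
import numpy as np

def shaped_reward(robot_next_pos, predicted_human_paths,
                  reached_goal: bool, collided: bool,
                  block_radius: float = 0.5) -> float:
    """Toy socially-aware reward: goal bonus, collision penalty, and a penalty
    whenever the robot's next position lies on a predicted future human path.
    All constants are illustrative assumptions."""
    reward = -0.01                       # small step cost to encourage efficiency
    if reached_goal:
        reward += 10.0
    if collided:
        reward -= 10.0
    robot_next_pos = np.asarray(robot_next_pos, dtype=float)
    for path in predicted_human_paths:   # each path: array of future (x, y) waypoints
        dists = np.linalg.norm(np.asarray(path, dtype=float) - robot_next_pos, axis=1)
        if np.any(dists < block_radius):
            reward -= 1.0                # discourage standing in a predicted human path
    return reward
```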
arXiv:2409.06952 (https://arxiv.org/abs/2409.06952) [pdf, other]
Subjects: cs.RO (Robotics)
Flow-Inspired Lightweight Multi-Robot Real-Time Scheduling Planner
Authors: Han Liu, Yu Jin, Tianjiang Hu, Kai Huang
Abstract: Collision avoidance and trajectory planning are crucial in multi-robot systems, particularly in environments with numerous obstacles. Although extensive research has been conducted in this field, the challenge of rapid traversal through such environments has not been fully addressed. This paper addresses the problem by proposing a novel real-time scheduling scheme designed to optimize the passage of multi-robot systems through complex, obstacle-rich maps. Inspired by network flow optimization, our scheme decomposes the environment into a network structure, enabling the efficient allocation of robots to paths based on real-time congestion data. The proposed scheduling planner operates on top of existing collision-avoidance algorithms, focusing on minimizing traversal time by balancing robot detours and waiting times. Our simulation results demonstrate the efficiency of the proposed scheme, and we validated its effectiveness through real-world flight tests using ten quadrotors. This work contributes a lightweight, effective scheduling planner capable of meeting the real-time demands of multi-robot systems in obstacle-rich environments.
Submitted 10 September, 2024; originally announced September 2024.
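The abstract only outlines the idea of allocating robots to paths from real-time congestion data. A minimal sketch of one such congestion-aware assignment, using a graph with per-edge loads and a Dijkstra search whose edge cost grows with the current load, is given below; the graph representation and the linear congestion penalty are assumptions.

```python
import heapq
from collections import defaultdict

def congestion_aware_path(graph, load, start, goal, alpha=1.0):
    """Dijkstra over edges whose cost = base length + alpha * current load.
    graph: {node: [(neighbor, base_cost), ...]}; load: {(u, v): robots on edge}."""
    dist, prev = {start: 0.0}, {}
    heap = [(0.0, start)]
    while heap:
        d, u = heapq.heappop(heap)
        if u == goal:
            break
        if d > dist.get(u, float("inf")):
            continue
        for v, base in graph.get(u, []):
            nd = d + base + alpha * load[(u, v)]
            if nd < dist.get(v, float("inf")):
                dist[v], prev[v] = nd, u
                heapq.heappush(heap, (nd, v))
    if goal not in dist:
        return None
    path, node = [goal], goal
    while node != start:
        node = prev[node]
        path.append(node)
    return path[::-1]

def assign_robots(graph, robots):
    """Greedily route each robot on the currently least-congested path,
    then add its edges to the shared load map."""
    load = defaultdict(int)
    plans = {}
    for name, (start, goal) in robots.items():
        path = congestion_aware_path(graph, load, start, goal)
        plans[name] = path
        if path:
            for u, v in zip(path, path[1:]):
                load[(u, v)] += 1
    return plans
```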
arXiv:2409.06633 (https://arxiv.org/abs/2409.06633) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
SaRA: High-Efficient Diffusion Model Fine-tuning with Progressive Sparse Low-Rank Adaptation
Authors: Teng Hu, Jiangning Zhang, Ran Yi, Hongrui Huang, Yabiao Wang, Lizhuang Ma
Abstract: In recent years, the development of diffusion models has led to significant progress in image and video generation tasks, with pre-trained models like the Stable Diffusion series playing a crucial role. Inspired by model pruning, which lightens large pre-trained models by removing unimportant parameters, we propose a novel model fine-tuning method that makes full use of these ineffective parameters and endows the pre-trained model with new task-specific capabilities. In this work, we first investigate the importance of parameters in pre-trained diffusion models and discover that the smallest 10% to 20% of parameters by absolute value do not contribute to the generation process. Based on this observation, we propose SaRA, a method that re-utilizes these temporarily ineffective parameters, which amounts to optimizing a sparse weight matrix to learn the task-specific knowledge. To mitigate overfitting, we propose a nuclear-norm-based low-rank sparse training scheme for efficient fine-tuning. Furthermore, we design a new progressive parameter adjustment strategy to make full use of the re-trained/fine-tuned parameters. Finally, we propose a novel unstructured backpropagation strategy, which significantly reduces memory costs during fine-tuning. Our method enhances the generative capabilities of pre-trained models in downstream applications and outperforms traditional fine-tuning methods like LoRA in maintaining the model's generalization ability. We validate our approach through fine-tuning experiments on SD models, demonstrating significant improvements. SaRA also offers the practical advantage of requiring only a single line of code modification for efficient implementation, and it is seamlessly compatible with existing methods.
Submitted 10 September, 2024; originally announced September 2024.
Comments: Parameter-efficient fine-tuning method
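The core selection step (treating the smallest-magnitude 10-20% of weights as trainable and freezing the rest) can be illustrated with a short PyTorch sketch. The gradient-masking mechanism below is a generic way to realize that idea and omits SaRA's nuclear-norm regularization, progressive adjustment, and memory-saving backpropagation.

```python
import torch

def smallest_magnitude_masks(model, ratio: float = 0.1):
    """For each weight tensor, mark the `ratio` smallest-magnitude entries as trainable."""
    masks = {}
    for name, p in model.named_parameters():
        k = max(1, int(ratio * p.numel()))
        threshold = p.detach().abs().flatten().kthvalue(k).values
        masks[name] = (p.detach().abs() <= threshold).float()
    return masks

def training_step(model, masks, loss_fn, batch, optimizer):
    """Ordinary step, except gradients outside the sparse mask are zeroed,
    so only the initially 'ineffective' parameters get updated."""
    optimizer.zero_grad()
    loss = loss_fn(model, batch)
    loss.backward()
    with torch.no_grad():
        for name, p in model.named_parameters():
            if p.grad is not None:
                p.grad.mul_(masks[name])
    optimizer.step()
    return loss.item()
```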
arXiv:2409.05379 (https://arxiv.org/abs/2409.05379) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.GR (Graphics)
PersonaTalk: Bring Attention to Your Persona in Visual Dubbing
Authors: Longhao Zhang, Shuang Liang, Zhipeng Ge, Tianshu Hu
Abstract: For audio-driven visual dubbing, it remains a considerable challenge to uphold and highlight the speaker's persona while synthesizing accurate lip synchronization. Existing methods fall short of capturing the speaker's unique speaking style or preserving facial details. In this paper, we present PersonaTalk, an attention-based two-stage framework, comprising geometry construction and face rendering, for high-fidelity and personalized visual dubbing. In the first stage, we propose a style-aware audio encoding module that injects speaking style into audio features through a cross-attention layer. The stylized audio features are then used to drive the speaker's template geometry to obtain lip-synced geometries. In the second stage, a dual-attention face renderer is introduced to render textures for the target geometries. It consists of two parallel cross-attention layers, namely Lip-Attention and Face-Attention, which respectively sample textures from different reference frames to render the entire face. With our innovative design, intricate facial details can be well preserved. Comprehensive experiments and user studies demonstrate our advantages over other state-of-the-art methods in terms of visual quality, lip-sync accuracy, and persona preservation. Furthermore, as a person-generic framework, PersonaTalk can achieve performance competitive with state-of-the-art person-specific methods. Project page: https://grisoon.github.io/PersonaTalk/.
Submitted 9 September, 2024; originally announced September 2024.
Comments: Accepted at SIGGRAPH Asia 2024 (Conference Track)
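The style-injection step (audio features attending to style tokens through cross-attention) can be sketched generically in PyTorch; the dimensions, the residual connection, and the way style tokens are obtained are assumptions rather than the paper's exact module.

```python
import torch
import torch.nn as nn

class StyleAwareAudioEncoder(nn.Module):
    """Toy cross-attention block: audio features (queries) attend to style tokens
    (keys/values); the result is added back to the audio stream."""
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, audio_feats, style_tokens):
        # audio_feats: (B, T, dim); style_tokens: (B, S, dim)
        styled, _ = self.attn(query=audio_feats, key=style_tokens, value=style_tokens)
        return self.norm(audio_feats + styled)

enc = StyleAwareAudioEncoder()
out = enc(torch.randn(2, 50, 256), torch.randn(2, 8, 256))
print(out.shape)  # torch.Size([2, 50, 256])
```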
arXiv:2409.04003 (https://arxiv.org/abs/2409.04003) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
DreamForge: Motion-Aware Autoregressive Video Generation for Multi-View Driving Scenes
Authors: Jianbiao Mei, Xuemeng Yang, Licheng Wen, Tao Hu, Yu Yang, Tiantian Wei, Yukai Ma, Min Dou, Botian Shi, Yong Liu
Abstract: Recent advances in diffusion models have improved controllable streetscape generation and supported downstream perception and planning tasks. However, challenges remain in accurately modeling driving scenes and generating long videos. To alleviate these issues, we propose DreamForge, an advanced diffusion-based autoregressive video generation model tailored for 3D-controllable long-term generation. To enhance lane and foreground generation, we introduce perspective guidance and integrate object-wise position encoding to incorporate local 3D correlation and improve foreground object modeling. We also propose motion-aware temporal attention to capture motion cues and appearance changes in videos. By leveraging motion frames and an autoregressive generation paradigm, we can autoregressively generate long videos (over 200 frames) using a 7-frame model, achieving superior quality compared to the baseline in 16-frame video evaluations.
Finally, we integrate our method with the realistic simulation platform DriveArena to provide more reliable open-loop and closed-loop evaluations for vision-based driving agents. The project page is available at https://pjlab-adg.github.io/DriveArena/dreamforge.
Submitted 24 November, 2024; v1 submitted 5 September, 2024; originally announced September 2024.
Comments: 12 figures, 8 tables

arXiv:2409.02375 (https://arxiv.org/abs/2409.02375) [pdf, other]
Subjects: cs.CL (Computation and Language)
How Privacy-Savvy Are Large Language Models? A Case Study on Compliance and Privacy Technical Review
Authors: Yang Liu, Xichou Zhu, Zhou Shen, Yi Liu, Min Li, Yujun Chen, Benzi John, Zhenzhen Ma, Tao Hu, Zhi Li, Bolong Yang, Manman Wang, Zongxing Xie, Peng Liu, Dan Cai, Junhui Wang
Abstract: The recent advances in large language models (LLMs) have significantly expanded their applications across various fields such as language generation, summarization, and complex question answering.
However, their application to privacy compliance and technical privacy reviews remains under-explored, raising critical concerns about their ability to adhere to global privacy standards and protect sensitive user data. This paper seeks to address this gap by providing a comprehensive case study evaluating LLMs' performance in privacy-related tasks such as privacy information extraction (PIE), legal and regulatory key point detection (KPD), and question answering (QA) with respect to privacy policies and data protection regulations. We introduce a Privacy Technical Review (PTR) framework, highlighting its role in mitigating privacy risks during the software development life cycle. Through an empirical assessment, we investigate the capacity of several prominent LLMs, including BERT, GPT-3.5, GPT-4, and custom models, to execute privacy compliance checks and technical privacy reviews. Our experiments benchmark the models across multiple dimensions, focusing on their precision, recall, and F1 scores in extracting privacy-sensitive information and detecting key regulatory compliance points. While LLMs show promise in automating privacy reviews and identifying regulatory discrepancies, significant gaps persist in their ability to fully comply with evolving legal standards. We provide actionable recommendations for enhancing LLMs' capabilities in privacy compliance, emphasizing the need for robust model improvements and better integration with legal and regulatory requirements. This study underscores the growing importance of developing privacy-aware LLMs that can both support businesses in compliance efforts and safeguard user privacy rights.
Submitted 14 February, 2025; v1 submitted 3 September, 2024; originally announced September 2024.
Comments: 8 pages, 4 figures
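Since the study reports precision, recall, and F1 for the extraction and detection tasks, a small sketch of how such scores can be computed with scikit-learn is shown below; the label set and the example predictions are made up for illustration.

```python
from sklearn.metrics import precision_recall_fscore_support

# Toy binary labels for a key-point-detection task: 1 = contains a regulatory key point.
y_true = [1, 0, 1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary"
)
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")
```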
arXiv:2409.02370 (https://arxiv.org/abs/2409.02370) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Do Large Language Models Possess Sensitive to Sentiment?
Authors: Yang Liu, Xichou Zhu, Zhou Shen, Yi Liu, Min Li, Yujun Chen, Benzi John, Zhenzhen Ma, Tao Hu, Zhi Li, Zhiyang Xu, Wei Luo, Junhui Wang
Abstract: Large Language Models (LLMs) have recently displayed their extraordinary capabilities in language understanding. However, how to comprehensively assess the sentiment capabilities of LLMs continues to be a challenge. This paper investigates the ability of LLMs to detect and react to sentiment in the text modality. As the integration of LLMs into diverse applications is on the rise, it becomes highly critical to comprehend their sensitivity to emotional tone, as this can influence the user experience and the efficacy of sentiment-driven tasks. We conduct a series of experiments to evaluate the performance of several prominent LLMs in identifying and responding appropriately to sentiments such as positive, negative, and neutral emotions.
The models' outputs are analyzed across various sentiment benchmarks, and their responses are compared with human evaluations. Our findings indicate that, although LLMs show a basic sensitivity to sentiment, there are substantial variations in their accuracy and consistency, emphasizing the need for further enhancements in their training processes to better capture subtle emotional cues. For example, in some cases the models might wrongly classify a strongly positive sentiment as neutral, or fail to recognize sarcasm or irony in the text. Such misclassifications highlight the complexity of sentiment analysis and the areas where the models need to be refined. Another aspect is that different LLMs might perform differently on the same set of data, depending on their architecture and training datasets. This variance calls for a more in-depth study of the factors that contribute to the performance differences and how they can be optimized.
Submitted 14 February, 2025; v1 submitted 3 September, 2024; originally announced September 2024.
Comments: 10 pages, 2 figures