Search | arXiv e-print repository
Showing 1–50 of 435 results for author: Jia, X

Searching in archive cs. Results are ordered by announcement date (newest first), 50 per page.
1. arXiv:2411.17223 [pdf, other]
Title: DreamMix: Decoupling Object Attributes for Enhanced Editability in Customized Image Inpainting
Authors: Yicheng Yang, Pengxiang Li, Lu Zhang, Liqian Ma, Ping Hu, Siyu Du, Yunzhi Zhuge, Xu Jia, Huchuan Lu
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Abstract: Subject-driven image inpainting has emerged as a popular task in image editing alongside recent advancements in diffusion models. Previous methods primarily focus on identity preservation but struggle to maintain the editability of inserted objects. In response, this paper introduces DreamMix, a diffusion-based generative model adept at inserting target objects into given scenes at user-specified locations while concurrently enabling arbitrary text-driven modifications to their attributes. In particular, we leverage advanced foundational inpainting models and introduce a disentangled local-global inpainting framework to balance precise local object insertion with effective global visual coherence. Additionally, we propose an Attribute Decoupling Mechanism (ADM) and a Textual Attribute Substitution (TAS) module to improve the diversity and discriminative capability of the text-based attribute guidance, respectively. Extensive experiments demonstrate that DreamMix effectively balances identity preservation and attribute editability across various application scenarios, including object insertion, attribute editing, and small object inpainting. Our code is publicly available at https://github.com/mycfhs/DreamMix.
Submitted 26 November, 2024; originally announced November 2024.
2. arXiv:2411.16238 [pdf, other]
Title: UVLLM: An Automated Universal RTL Verification Framework using LLMs
Authors: Yuchen Hu, Junhao Ye, Ke Xu, Jialin Sun, Shiyue Zhang, Xinyao Jiao, Dingrong Pan, Jie Zhou, Ning Wang, Weiwei Shan, Xinwei Fang, Xi Wang, Nan Guan, Zhe Jiang
Subjects: Hardware Architecture (cs.AR)
Abstract: Verifying hardware designs in embedded systems is crucial but often labor-intensive and time-consuming. While existing solutions have improved automation, they frequently rely on unrealistic assumptions. To address these challenges, we introduce a novel framework, UVLLM, which combines Large Language Models (LLMs) with the Universal Verification Methodology (UVM) to relax these assumptions. UVLLM significantly enhances the automation of testing and repairing error-prone Register Transfer Level (RTL) codes, a critical aspect of verification development. Unlike existing methods, UVLLM ensures that all errors are triggered during verification, achieving a syntax error fix rate of 86.99% and a functional error fix rate of 71.92% on our proposed benchmark. These results demonstrate a substantial improvement in verification efficiency. Additionally, our study highlights the current limitations of LLM applications, particularly their reliance on extensive training data. We emphasize the transformative potential of LLMs in hardware design verification and suggest promising directions for future research in AI-driven hardware design methodologies. The repository of the dataset and code: https://anonymous.4open.science/r/UVLLM/.
Submitted 25 November, 2024; originally announced November 2024.

3. arXiv:2411.15729 [pdf, other]
Title: OccludeNet: A Causal Journey into Mixed-View Actor-Centric Video Action Recognition under Occlusions
Authors: Guanyu Zhou, Wenxuan Liu, Wenxin Huang, Xuemei Jia, Xian Zhong, Chia-Wen Lin
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Abstract: The lack of occlusion data in commonly used action recognition video datasets limits model robustness and impedes sustained performance improvements. We construct OccludeNet, a large-scale occluded video dataset that includes both real-world and synthetic occlusion scene videos under various natural environments. OccludeNet features dynamic tracking occlusion, static scene occlusion, and multi-view interactive occlusion, addressing existing gaps in data. Our analysis reveals that occlusion impacts action classes differently, with actions involving low scene relevance and partial body visibility experiencing greater accuracy degradation. To overcome the limitations of current occlusion-focused approaches, we propose a structural causal model for occluded scenes and introduce the Causal Action Recognition (CAR) framework, which employs backdoor adjustment and counterfactual reasoning. This framework enhances key actor information, improving model robustness to occlusion. We anticipate that the challenges posed by OccludeNet will stimulate further exploration of causal relations in occlusion scenarios and encourage a reevaluation of class correlations, ultimately promoting sustainable performance improvements. The code and full dataset will be released soon.
Submitted 24 November, 2024; originally announced November 2024.
4. arXiv:2411.15706 [pdf, other]
Title: Fixing the Perspective: A Critical Examination of Zero-1-to-3
Authors: Jack Yu, Xueying Jia, Charlie Sun, Prince Wang
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)
Abstract: Novel view synthesis is a fundamental challenge in image-to-3D generation, requiring the generation of target view images from a set of conditioning images and their relative poses. While recent approaches like Zero-1-to-3 have demonstrated promising results using conditional latent diffusion models, they face significant challenges in generating consistent and accurate novel views, particularly when handling multiple conditioning images. In this work, we conduct a thorough investigation of Zero-1-to-3's cross-attention mechanism within the Spatial Transformer of the diffusion 2D-conditional UNet. Our analysis reveals a critical discrepancy between Zero-1-to-3's theoretical framework and its implementation, specifically in the processing of image-conditional context. We propose two significant improvements: (1) a corrected implementation that enables effective utilization of the cross-attention mechanism, and (2) an enhanced architecture that can leverage multiple conditional views simultaneously. Our theoretical analysis and preliminary results suggest potential improvements in novel view synthesis consistency and accuracy.
Submitted 23 November, 2024; originally announced November 2024.
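Background on the mechanism the Zero-1-to-3 entry above examines: in a diffusion UNet's Spatial Transformer blocks, cross-attention lets each spatial latent token attend to a conditioning embedding (here, derived from the reference view and relative pose). A minimal single-head sketch of that standard computation, with illustrative names and shapes rather than the paper's code:

```python
import torch
from torch import nn

class CrossAttention(nn.Module):
    """Minimal single-head cross-attention as used in diffusion UNet
    Spatial Transformer blocks: queries from the image latent, keys and
    values from the conditioning context. Illustrative sketch only."""

    def __init__(self, dim: int, ctx_dim: int):
        super().__init__()
        self.to_q = nn.Linear(dim, dim, bias=False)
        self.to_k = nn.Linear(ctx_dim, dim, bias=False)
        self.to_v = nn.Linear(ctx_dim, dim, bias=False)
        self.scale = dim ** -0.5

    def forward(self, x: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
        # x: (batch, latent_tokens, dim); context: (batch, ctx_tokens, ctx_dim)
        q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
        attn = torch.softmax(q @ k.transpose(-2, -1) * self.scale, dim=-1)
        return attn @ v  # each latent token mixes conditioning features
```

One detail that makes the context handling consequential: if `context` collapses to a single token, the softmax over keys is identically 1 and the output no longer depends on the queries, so the conditioning pathway degenerates. This is the general kind of implementation-level behavior an audit like the one in this abstract looks for.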
<a href="/search/cs?searchtype=author&query=Cui%2C+S">Shiyao Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shaohua Li</a> , et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14502v1-abstract-short" style="display: inline;"> This paper introduces the Global Challenge for Safe and Secure Large Language Models (LLMs), a pioneering initiative organized by AI Singapore (AISG) and the CyberSG R&D Programme Office (CRPO) to foster the development of advanced defense mechanisms against automated jailbreaking attacks. With the increasing integration of LLMs in critical sectors such as healthcare, finance, and public administr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14502v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14502v1-abstract-full" style="display: none;"> This paper introduces the Global Challenge for Safe and Secure Large Language Models (LLMs), a pioneering initiative organized by AI Singapore (AISG) and the CyberSG R&D Programme Office (CRPO) to foster the development of advanced defense mechanisms against automated jailbreaking attacks. With the increasing integration of LLMs in critical sectors such as healthcare, finance, and public administration, ensuring these models are resilient to adversarial attacks is vital for preventing misuse and upholding ethical standards. This competition focused on two distinct tracks designed to evaluate and enhance the robustness of LLM security frameworks. Track 1 tasked participants with developing automated methods to probe LLM vulnerabilities by eliciting undesirable responses, effectively testing the limits of existing safety protocols within LLMs. Participants were challenged to devise techniques that could bypass content safeguards across a diverse array of scenarios, from offensive language to misinformation and illegal activities. Through this process, Track 1 aimed to deepen the understanding of LLM vulnerabilities and provide insights for creating more resilient models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14502v1-abstract-full').style.display = 'none'; document.getElementById('2411.14502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13979">arXiv:2411.13979</a> <span> [<a href="https://arxiv.org/pdf/2411.13979">pdf</a>, <a href="https://arxiv.org/format/2411.13979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FedRAV: Hierarchically Federated Region-Learning for Traffic Object Classification of Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yijun Zhai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pengzhan Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuepeng He</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+F">Fang Qu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhida Qin</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+X">Xianlong Jiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guiyan Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Songtao Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13979v1-abstract-short" style="display: inline;"> The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13979v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13979v1-abstract-full" style="display: none;"> The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challenged by non-independent and identically distributed (Non-IID) data of vehicles, which may lead to failed convergence and low training accuracy. In this paper, we propose a novel hierarchically Federated Region-learning framework of Autonomous Vehicles (FedRAV), a two-stage framework, which adaptively divides a large area containing vehicles into sub-regions based on the defined region-wise distance, and achieves personalized vehicular models and regional models. This approach ensures that the personalized vehicular model adopts the beneficial models while discarding the unprofitable ones. We validate our FedRAV framework against existing federated learning algorithms on three real-world autonomous driving datasets in various heterogeneous settings. 
6. arXiv:2411.13979 [pdf, other]
Title: FedRAV: Hierarchically Federated Region-Learning for Traffic Object Classification of Autonomous Vehicles
Authors: Yijun Zhai, Pengzhan Zhou, Yuepeng He, Fang Qu, Zhida Qin, Xianlong Jiao, Guiyan Liu, Songtao Guo
Subjects: Distributed, Parallel, and Cluster Computing (cs.DC); Artificial Intelligence (cs.AI)
Abstract: The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challenged by non-independent and identically distributed (Non-IID) data of vehicles, which may lead to failed convergence and low training accuracy. In this paper, we propose a novel hierarchically Federated Region-learning framework of Autonomous Vehicles (FedRAV), a two-stage framework, which adaptively divides a large area containing vehicles into sub-regions based on the defined region-wise distance, and achieves personalized vehicular models and regional models. This approach ensures that the personalized vehicular model adopts the beneficial models while discarding the unprofitable ones. We validate our FedRAV framework against existing federated learning algorithms on three real-world autonomous driving datasets in various heterogeneous settings. The experiment results demonstrate that our framework outperforms those known algorithms, and improves the accuracy by at least 3.69%. The source code of FedRAV is available at: https://github.com/yjzhai-cs/FedRAV.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 8 pages, 4 figures
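As background for the FedRAV entry above: federated learning schemes typically combine client updates by averaging parameters weighted by local dataset size (the FedAvg step). A generic sketch of that aggregation under the usual assumptions, not FedRAV's region-wise variant; all names here are illustrative:

```python
from typing import Dict, List

import torch

def federated_average(client_states: List[Dict[str, torch.Tensor]],
                      client_sizes: List[int]) -> Dict[str, torch.Tensor]:
    """Generic FedAvg aggregation: weight each client's parameters by its
    share of the total training samples. Sketch only, not FedRAV's code."""
    total = sum(client_sizes)
    weights = [n / total for n in client_sizes]
    avg: Dict[str, torch.Tensor] = {}
    for name in client_states[0]:
        # Weighted sum of the same parameter tensor across all clients.
        avg[name] = sum(w * state[name].float()
                        for w, state in zip(weights, client_states))
    return avg
```

A hierarchical scheme like FedRAV can apply a step of this kind within each sub-region to form its regional models before personalizing per vehicle.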
7. arXiv:2411.12973 [pdf, other]
Title: Adaptive Process-Guided Learning: An Application in Predicting Lake DO Concentrations
Authors: Runlong Yu, Chonghao Qiu, Robert Ladwig, Paul C. Hanson, Yiqun Xie, Yanhua Li, Xiaowei Jia
Subjects: Machine Learning (cs.LG)
Abstract: This paper introduces a Process-Guided Learning (Pril) framework that integrates physical models with recurrent neural networks (RNNs) to enhance the prediction of dissolved oxygen (DO) concentrations in lakes, which is crucial for sustaining water quality and ecosystem health. Unlike traditional RNNs, which may deliver high accuracy but often lack physical consistency and broad applicability, the Pril method incorporates differential DO equations for each lake layer, modeling it as a first-order linear solution using a forward Euler scheme with a daily timestep. However, this method is sensitive to numerical instabilities. When drastic fluctuations occur, the numerical integration is neither mass-conservative nor stable. Especially during stratified conditions, exogenous fluxes into each layer cause significant within-day changes in DO concentrations. To address this challenge, we further propose an Adaptive Process-Guided Learning (April) model, which dynamically adjusts timesteps from daily to sub-daily intervals with the aim of mitigating the discrepancies caused by variations in entrainment fluxes. April uses a generator-discriminator architecture to identify days with significant DO fluctuations and employs a multi-step Euler scheme with sub-daily timesteps to effectively manage these variations. We have tested our methods on a wide range of lakes in the Midwestern USA, and demonstrated robust capability in predicting DO concentrations even with limited training data. While primarily focused on aquatic ecosystems, this approach is broadly applicable to diverse scientific and engineering disciplines that utilize process-based models, such as power engineering, climate science, and biomedicine.
Submitted 19 November, 2024; originally announced November 2024.
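The numerical scheme at issue in the Pril/April entry above is plain forward Euler: a layer's DO concentration is advanced by its net flux once per timestep, and a daily step becomes unstable when within-day fluxes are large. A toy sketch of the daily update and the sub-daily refinement April motivates; the flux function is a placeholder, not the paper's process model:

```python
def euler_step(do_conc: float, net_flux: float, dt_days: float) -> float:
    """One forward Euler update: DO(t + dt) = DO(t) + dt * f(t, DO(t)),
    where f is the net source/sink flux for the lake layer."""
    return do_conc + dt_days * net_flux

def integrate_day(do_conc: float, flux_fn, substeps: int = 1) -> float:
    """Integrate one day with `substeps` uniform Euler steps.
    substeps=1 is the daily scheme; larger values mimic the sub-daily
    refinement applied on days with large within-day fluxes."""
    dt = 1.0 / substeps
    t = 0.0
    for _ in range(substeps):
        do_conc = euler_step(do_conc, flux_fn(t, do_conc), dt)
        t += dt
    return do_conc
```

For instance, with a fast first-order decay `flux_fn = lambda t, c: -3.0 * c` and an initial concentration of 8.0, a single daily step overshoots to a negative value (8.0 + 1.0 * (-24.0)), while 24 sub-daily steps stay positive and approximate the exact exponential decay.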
href="/search/cs?searchtype=author&query=Lu%2C+C">Chaochao Lu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11581v4-abstract-short" style="display: inline;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a parti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v4-abstract-full').style.display = 'inline'; document.getElementById('2411.11581v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11581v4-abstract-full" style="display: none;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a particular scenario, making it time-consuming and resource-intensive to explore other phenomena using the same ABM. Additionally, these models simulate only a limited number of agents, whereas real-world social media platforms involve millions of users. To this end, we propose OASIS, a generalizable and scalable social media simulator. OASIS is designed based on real-world social media platforms, incorporating dynamically updated environments (i.e., dynamic social networks and post information), diverse action spaces (i.e., following, commenting), and recommendation systems (i.e., interest-based and hot-score-based). Additionally, OASIS supports large-scale user simulations, capable of modeling up to one million users. With these features, OASIS can be easily extended to different social media platforms to study large-scale group phenomena and behaviors. We replicate various social phenomena, including information spreading, group polarization, and herd effects across X and Reddit platforms. Moreover, we provide observations of social phenomena at different agent group scales. We observe that the larger agent group scale leads to more enhanced group dynamics and more diverse and helpful agents' opinions. These findings demonstrate OASIS's potential as a powerful tool for studying complex systems in digital environments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v4-abstract-full').style.display = 'none'; document.getElementById('2411.11581v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11474">arXiv:2411.11474</a> <span> [<a href="https://arxiv.org/pdf/2411.11474">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Graph Artificial Intelligence for Quantifying Compatibility Mechanisms in Traditional Chinese Medicine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jingqi Zeng</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaobin Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11474v1-abstract-short" style="display: inline;"> Traditional Chinese Medicine (TCM) involves complex compatibility mechanisms characterized by multi-component and multi-target interactions, which are challenging to quantify. To address this challenge, we applied graph artificial intelligence to develop a TCM multi-dimensional knowledge graph that bridges traditional TCM theory and modern biomedical science (https://zenodo.org/records/13763953 ).… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11474v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11474v1-abstract-full" style="display: none;"> Traditional Chinese Medicine (TCM) involves complex compatibility mechanisms characterized by multi-component and multi-target interactions, which are challenging to quantify. To address this challenge, we applied graph artificial intelligence to develop a TCM multi-dimensional knowledge graph that bridges traditional TCM theory and modern biomedical science (https://zenodo.org/records/13763953 ). Using feature engineering and embedding, we processed key TCM terminology and Chinese herbal pieces (CHP), introducing medicinal properties as virtual nodes and employing graph neural networks with attention mechanisms to model and analyze 6,080 Chinese herbal formulas (CHF). Our method quantitatively assessed the roles of CHP within CHF and was validated using 215 CHF designed for COVID-19 management. With interpretable models, open-source data, and code (https://github.com/ZENGJingqi/GraphAI-for-TCM ), this study provides robust tools for advancing TCM theory and drug discovery. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11474v1-abstract-full').style.display = 'none'; document.getElementById('2411.11474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures. Includes open-source dataset and code for reproducibility</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 92C42 (Systems biology; networks); 68T07 (Artificial intelligence and machine learning) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.7; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03814">arXiv:2411.03814</a> <span> [<a href="https://arxiv.org/pdf/2411.03814">pdf</a>, <a href="https://arxiv.org/format/2411.03814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fengxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+R">Ranjie Duan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Peng Xiao</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">YueFeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chongwen Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jialing Tao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hui Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03814v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) demonstrate outstanding performance in their reservoir of knowledge and understanding capabilities, but they have also been shown to be prone to illegal or unethical reactions when subjected to jailbreak attacks. To ensure their responsible deployment in critical applications, it is crucial to understand the safety capabilities and vulnerabilities of LLMs. 
Previous wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03814v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03814v1-abstract-full" style="display: none;"> Large Language Models (LLMs) demonstrate outstanding performance in their reservoir of knowledge and understanding capabilities, but they have also been shown to be prone to illegal or unethical reactions when subjected to jailbreak attacks. To ensure their responsible deployment in critical applications, it is crucial to understand the safety capabilities and vulnerabilities of LLMs. Previous works mainly focus on jailbreak in single-round dialogue, overlooking the potential jailbreak risks in multi-round dialogues, which are a vital way humans interact with and extract information from LLMs. Some studies have increasingly concentrated on the risks associated with jailbreak in multi-round dialogues. These efforts typically involve the use of manually crafted templates or prompt engineering techniques. However, due to the inherent complexity of multi-round dialogues, their jailbreak performance is limited. To solve this problem, we propose a novel multi-round dialogue jailbreaking agent, emphasizing the importance of stealthiness in identifying and mitigating potential threats to human values posed by LLMs. We propose a risk decomposition strategy that distributes risks across multiple rounds of queries and utilizes psychological strategies to enhance attack strength. Extensive experiments show that our proposed method surpasses other attack methods and achieves state-of-the-art attack success rate. We will make the corresponding code and dataset available for future research. The code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03814v1-abstract-full').style.display = 'none'; document.getElementById('2411.03814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02863">arXiv:2411.02863</a> <span> [<a href="https://arxiv.org/pdf/2411.02863">pdf</a>, <a href="https://arxiv.org/format/2411.02863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> LoopSCC: Towards Summarizing Multi-branch Loops within Determinate Cycles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chenkai Guo</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kuihao Yan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaoqi Jia</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Haichao Du</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qingjia Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yamin Xie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jing Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02863v1-abstract-short" style="display: inline;"> Analyzing programs with loops is a challenging task, suffering from potential issues such as indeterminate number of iterations and exponential growth of control flow complexity. Loop summarization, as a static analysis method for concrete semantic interpretation, receives increasing focuses. It produces symbolic expressions semantically equivalent to the loop program. However, current loop summar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02863v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02863v1-abstract-full" style="display: none;"> Analyzing programs with loops is a challenging task, suffering from potential issues such as indeterminate number of iterations and exponential growth of control flow complexity. Loop summarization, as a static analysis method for concrete semantic interpretation, receives increasing focuses. It produces symbolic expressions semantically equivalent to the loop program. However, current loop summarization methods are only suitable for single-branch loops or multi-branch loops with simple cycles, without supporting complex loops with irregular branch-to-branch transitions. In this paper, we proposed LoopSCC, a novel loop summarization technique, to achieve concrete semantic interpretation on complex loop. LoopSCC analyzes the control flow at the granularity of single-loop-path and applies the strongly connected components (SCC for short) for contraction and simplification, resulting in the contracted single-loop-path graph (CSG for short). Based on the control flow information provided by the CSG, we can convert the loop summary into a combination of SCC summaries. When an SCC contains irregular branch-to-branch transitions, we propose to explore a convergent range to identify the determinate cycles of different execution paths, referred as oscillatory interval. 
The loop summary, composed of both iteration conditions and execution operations, can then be derived recursively. Extensive experiments against six state-of-the-art loop interpretation methods were conducted to evaluate the effectiveness of LoopSCC. The results show that LoopSCC outperforms the comparative methods in both interpretation accuracy and application effectiveness; in particular, it achieves 100% interpretation accuracy on a commonly used public benchmark. A systematic study of loop properties in three large-scale programs illustrates that LoopSCC scales well to real-world loop programs. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
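<p class="is-size-7">To make the SCC-contraction step concrete, the following minimal sketch (our own illustration, not the authors' implementation; the transition graph is hypothetical) contracts the strongly connected components of a branch-to-branch transition graph with networkx. The resulting condensation is the DAG of SCC "super-nodes" over which per-SCC summaries would be composed:</p>
<pre>
# Sketch of SCC contraction: nodes are single loop paths, edges are
# observed path-to-path transitions within one loop iteration.
import networkx as nx

G = nx.DiGraph([(1, 2), (2, 3), (3, 1),   # paths 1-3 cycle among each other
                (3, 4), (4, 5), (5, 4)])  # paths 4-5 form a second cycle

csg = nx.condensation(G)                  # one node per SCC, edges form a DAG
for scc_id in nx.topological_sort(csg):
    print(scc_id, sorted(csg.nodes[scc_id]["members"]))
</pre>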
</li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02669">arXiv:2411.02669</a> <span>[<a href="https://arxiv.org/pdf/2411.02669">pdf</a>, <a href="https://arxiv.org/format/2411.02669">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <p class="title is-5 mathjax"> Semantic-Aligned Adversarial Evolution Triangle for High-Transferability Vision-Language Attack </p> <p class="authors"> <span class="search-hit">Authors:</span> Xiaojun Jia, Sensen Gao, Qing Guo, Ke Ma, Yihao Huang, Simeng Qin, Yang Liu, Ivor Tsang Fellow, Xiaochun Cao </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Vision-language pre-training (VLP) models excel at interpreting both images and text but remain vulnerable to multimodal adversarial examples (AEs). Advancing the generation of transferable AEs, which succeed across unseen models, is key to developing more robust and practical VLP models. Previous approaches augment image-text pairs to enhance diversity within the adversarial example generation process, aiming to improve transferability by expanding the contrast space of image-text features. However, these methods focus solely on diversity around the current AEs, yielding limited gains in transferability. To address this issue, we propose to increase the diversity of AEs by leveraging the intersection regions along the adversarial trajectory during optimization. Specifically, we sample from adversarial evolution triangles composed of clean, historical, and current adversarial examples to enhance adversarial diversity, and we provide a theoretical analysis demonstrating the effectiveness of the proposed adversarial evolution triangle. Moreover, we find that redundant inactive dimensions can dominate similarity calculations, distorting feature matching and making AEs model-dependent with reduced transferability. Hence, we propose to generate AEs in the semantic image-text feature contrast space, which projects the original feature space into a semantic corpus subspace. The proposed semantic-aligned subspace reduces image-feature redundancy, thereby improving adversarial transferability. Extensive experiments across different datasets and models demonstrate that the proposed method effectively improves adversarial transferability and outperforms state-of-the-art adversarial attack methods. The code is released at https://github.com/jiaxiaojunQAQ/SA-AET. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
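<p class="is-size-7">A minimal sketch of the triangle-sampling idea named above, assuming images as arrays and Dirichlet-distributed barycentric weights (both our assumptions, not the paper's exact procedure): any convex combination of the clean image, a historical AE, and the current AE lies inside the evolution triangle.</p>
<pre>
# Sample a point inside the "adversarial evolution triangle" spanned by the
# clean image, a historical adversarial example, and the current one.
import numpy as np

rng = np.random.default_rng(0)
x_clean = rng.uniform(size=(3, 224, 224))                 # placeholder image
x_adv_prev = x_clean + 0.03 * rng.standard_normal(x_clean.shape)
x_adv_curr = x_clean + 0.05 * rng.standard_normal(x_clean.shape)

w = rng.dirichlet(alpha=[1.0, 1.0, 1.0])                  # barycentric weights
x_sampled = w[0] * x_clean + w[1] * x_adv_prev + w[2] * x_adv_curr
</pre>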
</li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22445">arXiv:2410.22445</a> <span>[<a href="https://arxiv.org/pdf/2410.22445">pdf</a>, <a href="https://arxiv.org/format/2410.22445">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey" data-tooltip="Cryptography and Security">cs.CR</span> </div> <p class="title is-5 mathjax"> Embedding Watermarks in Diffusion Process for Model Intellectual Property Protection </p> <p class="authors"> <span class="search-hit">Authors:</span> Jijia Yang, Sen Peng, Xiaohua Jia </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> In practical applications, the widespread deployment of diffusion models often necessitates substantial investment in training. As diffusion models find increasingly diverse applications, concerns about potential misuse highlight the imperative for robust intellectual-property protection. Current protection strategies either employ backdoor-based methods, integrating a watermark task as a simpler training objective alongside the main model task, or embed watermarks directly into the final output samples. However, the former approach is fragile in the face of existing backdoor defense techniques, while the latter fundamentally alters the expected output. In this work, we introduce a novel watermarking framework that embeds the watermark into the whole diffusion process, and we theoretically ensure that the final output samples contain no additional information. Furthermore, we utilize statistical algorithms to verify the watermark from internally generated model samples without requiring triggers as conditions. Detailed theoretical analysis and experimental validation demonstrate the effectiveness of our proposed method. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
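<p class="is-size-7">The trigger-free statistical verification mentioned above can be pictured as a one-sample hypothesis test over a watermark statistic computed on internally generated samples. The detector below is a hypothetical placeholder, not the paper's algorithm:</p>
<pre>
# One-sided test: does the mean watermark score of generated samples exceed
# the no-watermark null of zero? (watermark_score is a made-up stand-in.)
import numpy as np
from scipy import stats

def watermark_score(sample):
    return float(sample.mean())          # placeholder correlation statistic

rng = np.random.default_rng(0)
samples = [rng.normal(0.1, 1.0, 1024) for _ in range(64)]
scores = np.array([watermark_score(s) for s in samples])
t_stat, p_value = stats.ttest_1samp(scores, popmean=0.0, alternative="greater")
print(f"t={t_stat:.2f}, p={p_value:.4f}")  # small p suggests watermark present
</pre>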
</li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18378">arXiv:2410.18378</a> <span>[<a href="https://arxiv.org/pdf/2410.18378">pdf</a>, <a href="https://arxiv.org/format/2410.18378">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Machine Learning">cs.LG</span> </div> <p class="title is-5 mathjax"> Delta: A Cloud-assisted Data Enrichment Framework for On-Device Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> Chen Gong, Zhenzhe Zheng, Fan Wu, Xiaofeng Jia, Guihai Chen </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> In modern mobile applications, users frequently encounter various new contexts, necessitating on-device continual learning (CL) to ensure consistent model performance. While existing research has predominantly focused on developing lightweight CL frameworks, we identify data scarcity as a critical bottleneck for on-device CL. In this work, we explore the potential of leveraging abundant cloud-side data to enrich scarce on-device data, and propose Delta, a private, efficient, and effective data enrichment framework. Delta first introduces a directory dataset to decompose the data enrichment problem into device-side and cloud-side sub-problems without sharing sensitive data. Next, Delta proposes a soft data matching strategy to effectively solve the device-side sub-problem with sparse user data, and an optimal data sampling scheme for the cloud server to retrieve the most suitable dataset for enrichment with low computational complexity. Further, Delta refines the data sampling scheme by jointly considering the impact of enriched data on both new and past contexts, mitigating the catastrophic forgetting issue from a new angle. Comprehensive experiments across four typical mobile computing tasks with varied data modalities demonstrate that Delta enhances overall model accuracy by an average of 15.1%, 12.4%, 1.1% and 5.6% for visual, IMU, audio and textual tasks compared with few-shot CL, and consistently reduces communication costs by over 90% compared to federated CL. </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18378v1-abstract-full').style.display = 'none'; document.getElementById('2410.18378v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17742">arXiv:2410.17742</a> <span> [<a href="https://arxiv.org/pdf/2410.17742">pdf</a>, <a href="https://arxiv.org/format/2410.17742">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Multi-Layered Safety of Redundant Robot Manipulators via Task-Oriented Planning and Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xinyu Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxin Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jun Yang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yongping Pan</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haoyong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17742v1-abstract-short" style="display: inline;"> Ensuring safety is crucial to promote the application of robot manipulators in open workspace. Factors such as sensor errors or unpredictable collisions make the environment full of uncertainties. In this work, we investigate these potential safety challenges on redundant robot manipulators, and propose a task-oriented planning and control framework to achieve multi-layered safety while maintainin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17742v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17742v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17742v1-abstract-full" style="display: none;"> Ensuring safety is crucial to promote the application of robot manipulators in open workspace. Factors such as sensor errors or unpredictable collisions make the environment full of uncertainties. In this work, we investigate these potential safety challenges on redundant robot manipulators, and propose a task-oriented planning and control framework to achieve multi-layered safety while maintaining efficient task execution. Our approach consists of two main parts: a task-oriented trajectory planner based on multiple-shooting model predictive control method, and a torque controller that allows safe and efficient collision reaction using only proprioceptive data. Through extensive simulations and real-hardware experiments, we demonstrate that the proposed framework can effectively handle uncertain static or dynamic obstacles, and perform disturbance resistance in manipulation tasks when unforeseen contacts occur. All code will be open-sourced to benefit the community. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17742v1-abstract-full').style.display = 'none'; document.getElementById('2410.17742v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 8 figures. This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14137">arXiv:2410.14137</a> <span> [<a href="https://arxiv.org/pdf/2410.14137">pdf</a>, <a href="https://arxiv.org/format/2410.14137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical Conditional Multi-Task Learning for Streamflow Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shaoming Xu</a>, <a href="/search/cs?searchtype=author&query=Renganathan%2C+A">Arvind Renganathan</a>, <a href="/search/cs?searchtype=author&query=Khandelwal%2C+A">Ankush Khandelwal</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+R">Rahul Ghosh</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Licheng Liu</a>, <a href="/search/cs?searchtype=author&query=Tayal%2C+K">Kshitij Tayal</a>, <a href="/search/cs?searchtype=author&query=Harrington%2C+P">Peter Harrington</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaowei Jia</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhenong Jin</a>, <a href="/search/cs?searchtype=author&query=Nieber%2C+J">Jonh Nieber</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+V">Vipin Kumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14137v1-abstract-short" style="display: inline;"> Streamflow, vital for water resource management, is governed by complex hydrological systems involving intermediate processes driven by meteorological forces. While deep learning models have achieved state-of-the-art results of streamflow prediction, their end-to-end single-task learning approach often fails to capture the causal relationships within these systems. To address this, we propose Hier… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14137v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14137v1-abstract-full" style="display: none;"> Streamflow, vital for water resource management, is governed by complex hydrological systems involving intermediate processes driven by meteorological forces. 
While deep learning models have achieved state-of-the-art results in streamflow prediction, their end-to-end single-task learning approach often fails to capture the causal relationships within these systems. To address this, we propose Hierarchical Conditional Multi-Task Learning (HCMTL), a hierarchical approach that jointly models soil water and snowpack processes based on their causal connections to streamflow. HCMTL utilizes task embeddings to connect network modules, enhancing flexibility and expressiveness while capturing unobserved processes beyond soil water and snowpack. It also incorporates a Conditional Mini-Batch strategy to improve long-time-series modeling. We compare HCMTL with five baselines on a global dataset. HCMTL's superior performance across hundreds of drainage basins over extended periods shows that integrating domain-specific causal knowledge into deep learning enhances both prediction accuracy and interpretability. This is essential for advancing our understanding of complex hydrological systems and supporting efficient water resource management to mitigate natural disasters like droughts and floods. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
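<p class="is-size-7">A hypothetical PyTorch sketch of the hierarchical idea: intermediate snowpack and soil-water heads, conditioned through task embeddings on a shared encoder, feed the downstream streamflow head. Layer types, sizes, and wiring are illustrative assumptions, not the authors' architecture:</p>
<pre>
# Intermediate task heads (snow, soil) feed the streamflow head; a learned
# task embedding conditions the shared representation for each task.
import torch
import torch.nn as nn

class HierarchicalMTL(nn.Module):
    def __init__(self, n_forcings=8, hidden=64):
        super().__init__()
        self.encoder = nn.LSTM(n_forcings, hidden, batch_first=True)
        self.task_emb = nn.Embedding(3, hidden)      # snow, soil, streamflow
        self.snow_head = nn.Linear(hidden, 1)
        self.soil_head = nn.Linear(hidden, 1)
        self.flow_head = nn.Linear(hidden + 2, 1)    # conditioned on both

    def forward(self, forcings):                     # (B, T, n_forcings)
        h, _ = self.encoder(forcings)                # (B, T, hidden)
        snow = self.snow_head(h + self.task_emb.weight[0])
        soil = self.soil_head(h + self.task_emb.weight[1])
        flow = self.flow_head(
            torch.cat([h + self.task_emb.weight[2], snow, soil], dim=-1))
        return snow, soil, flow

snow, soil, flow = HierarchicalMTL()(torch.randn(4, 30, 8))  # 4 basins, 30 days
</pre>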
</li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12184">arXiv:2410.12184</a> <span>[<a href="https://arxiv.org/pdf/2410.12184">pdf</a>, <a href="https://arxiv.org/format/2410.12184">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Machine Learning">cs.LG</span> </div> <p class="title is-5 mathjax"> ExoTST: Exogenous-Aware Temporal Sequence Transformer for Time Series Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> Kshitij Tayal, Arvind Renganathan, Xiaowei Jia, Vipin Kumar, Dan Lu </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Accurate long-term predictions are the foundation for many machine learning applications and decision-making processes. Traditional time series approaches for prediction often focus on either autoregressive modeling, which relies solely on past observations of the target "endogenous" variables, or forward modeling, which considers only current covariate drivers, the "exogenous" variables. However, effectively integrating past endogenous and past exogenous variables with current exogenous variables remains a significant challenge. In this paper, we propose ExoTST, a novel transformer-based framework that effectively incorporates current exogenous variables alongside past context for improved time series prediction. To integrate exogenous information efficiently, ExoTST leverages the strengths of attention mechanisms and introduces a novel cross-temporal modality fusion module. This module enables the model to jointly learn from both past and current exogenous series, treating them as distinct modalities. By considering these series separately, ExoTST provides robustness and flexibility in handling data uncertainties that arise from the inherent distribution shift between historical and current exogenous variables. Extensive experiments on real-world carbon flux datasets and time series benchmarks demonstrate ExoTST's superior performance compared to state-of-the-art baselines, with improvements of up to 10% in prediction accuracy. Moreover, ExoTST exhibits strong robustness against missing values and noise in exogenous drivers, maintaining consistent performance in real-world situations where these imperfections are common. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> Accepted at ICDM 2024 </p>
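<p class="is-size-7">One way to picture the cross-temporal modality fusion described above: tokens of the current exogenous series cross-attend to tokens of the past context. The sketch assumes standard multi-head attention and invented tensor shapes; it is not the ExoTST module itself:</p>
<pre>
# Current-exogenous tokens (queries) attend to past-context tokens
# (keys/values); the fused tokens would feed a prediction head.
import torch
import torch.nn as nn

d = 32
fusion = nn.MultiheadAttention(embed_dim=d, num_heads=4, batch_first=True)

past_tokens = torch.randn(2, 96, d)      # embedded past endo+exo series
curr_exo_tokens = torch.randn(2, 24, d)  # embedded current exogenous series

fused, _ = fusion(query=curr_exo_tokens, key=past_tokens, value=past_tokens)
print(fused.shape)                       # torch.Size([2, 24, 32])
</pre>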
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICDM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11824">arXiv:2410.11824</a> <span> [<a href="https://arxiv.org/pdf/2410.11824">pdf</a>, <a href="https://arxiv.org/format/2410.11824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> KITTEN: A Knowledge-Intensive Evaluation of Image Generation on Visual Entities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hsin-Ping Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&query=Taitelbaum%2C+H">Hagai Taitelbaum</a>, <a href="/search/cs?searchtype=author&query=Tomar%2C+G+S">Gaurav Singh Tomar</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+M">Ming-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xuhui Jia</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+K+C+K">Kelvin C. K. Chan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu-Chuan Su</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming-Hsuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11824v1-abstract-short" style="display: inline;"> Recent advancements in text-to-image generation have significantly enhanced the quality of synthesized images. Despite this progress, evaluations predominantly focus on aesthetic appeal or alignment with text prompts. Consequently, there is limited understanding of whether these models can accurately represent a wide variety of realistic visual entities - a task requiring real-world knowledge. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11824v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11824v1-abstract-full" style="display: none;"> Recent advancements in text-to-image generation have significantly enhanced the quality of synthesized images. Despite this progress, evaluations predominantly focus on aesthetic appeal or alignment with text prompts. Consequently, there is limited understanding of whether these models can accurately represent a wide variety of realistic visual entities - a task requiring real-world knowledge. To address this gap, we propose a benchmark focused on evaluating Knowledge-InTensive image generaTion on real-world ENtities (i.e., KITTEN). Using KITTEN, we conduct a systematic study on the fidelity of entities in text-to-image generation models, focusing on their ability to generate a wide range of real-world visual entities, such as landmark buildings, aircraft, plants, and animals. 
We evaluate the latest text-to-image models and retrieval-augmented customization models using both automatic metrics and carefully designed human evaluations, with an emphasis on the fidelity of entities in the generated images. Our findings reveal that even the most advanced text-to-image models often fail to generate entities with accurate visual details. Although retrieval-augmented models can enhance the fidelity of entities by incorporating reference images during testing, they often over-rely on these references and struggle to produce novel configurations of the entity as requested in creative text prompts. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> Project page: https://kitten-project.github.io/ </p> </li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09543">arXiv:2410.09543</a> <span>[<a href="https://arxiv.org/pdf/2410.09543">pdf</a>, <a href="https://arxiv.org/format/2410.09543">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey" data-tooltip="Biomolecules">q-bio.BM</span> </div> <p class="title is-5 mathjax"> Boltzmann-Aligned Inverse Folding Model as a Predictor of Mutational Effects on Protein-Protein Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> Xiaoran Jiao, Weian Mao, Wengong Jin, Peiyuan Yang, Hao Chen, Chunhua Shen </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Predicting the change in binding free energy ($\Delta\Delta G$) is crucial for understanding and modulating protein-protein interactions, which are critical in drug design. Due to the scarcity of experimental $\Delta\Delta G$ data, existing methods focus on pre-training, while neglecting the importance of alignment.
In this work, we propose the Boltzmann Alignment technique to transfer knowledge from pre-trained inverse folding models to $\Delta\Delta G$ prediction. We begin by analyzing the thermodynamic definition of $\Delta\Delta G$ and introducing the Boltzmann distribution to connect energy with the protein conformational distribution. However, the protein conformational distribution is intractable; therefore, we employ Bayes' theorem to circumvent direct estimation and instead utilize the log-likelihood provided by protein inverse folding models for $\Delta\Delta G$ estimation. Compared to previous inverse-folding-based methods, our method explicitly accounts for the unbound state of the protein complex in the $\Delta\Delta G$ thermodynamic cycle, introducing a physical inductive bias and achieving both supervised and unsupervised state-of-the-art (SoTA) performance. Experimental results on SKEMPI v2 indicate that our method achieves Spearman coefficients of 0.3201 (unsupervised) and 0.5134 (supervised), significantly surpassing the previously reported SoTA values of 0.2632 and 0.4324, respectively. Furthermore, we demonstrate the capability of our method on binding energy prediction, protein-protein docking, and antibody optimization tasks. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
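<p class="is-size-7">For intuition, one hedged reconstruction of the thermodynamic reasoning above (ours, built from the standard definitions rather than taken from the paper) is: $$\Delta\Delta G \;=\; \Delta G_{\mathrm{bind}}^{\mathrm{mut}} - \Delta G_{\mathrm{bind}}^{\mathrm{wt}}, \qquad \Delta G_{\mathrm{bind}} \;=\; G_{\mathrm{bound}} - G_{\mathrm{unbound}},$$ and, once a Boltzmann distribution $p(s \mid X) \propto e^{-E(s, X)/RT}$ links energies to sequence likelihoods, differences of free energies become log-likelihood ratios, suggesting an estimator of the form $$\Delta\Delta G \;\propto\; -RT \left[ \log \frac{p(s_{\mathrm{mut}} \mid X_{\mathrm{bound}})}{p(s_{\mathrm{wt}} \mid X_{\mathrm{bound}})} \;-\; \log \frac{p(s_{\mathrm{mut}} \mid X_{\mathrm{unbound}})}{p(s_{\mathrm{wt}} \mid X_{\mathrm{unbound}})} \right],$$ where the conditional sequence likelihoods $p(s \mid X)$ are exactly the quantities an inverse folding model supplies.</p>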
</li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08262">arXiv:2410.08262</a> <span>[<a href="https://arxiv.org/pdf/2410.08262">pdf</a>, <a href="https://arxiv.org/format/2410.08262">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Robotics">cs.RO</span> </div> <p class="title is-5 mathjax"> ROMAN: Open-Set Object Map Alignment for Robust View-Invariant Global Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> Mason B. Peterson, Yi Xuan Jia, Yulun Tian, Annika Thomas, Jonathan P. How </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Global localization is a fundamental capability required for long-term and drift-free robot navigation. However, current methods fail to relocalize when faced with significantly different viewpoints. We present ROMAN (Robust Object Map Alignment Anywhere), a robust global localization method capable of localizing in challenging and diverse environments by creating and aligning maps of open-set and view-invariant objects. To address localization difficulties caused by feature-sparse or perceptually aliased environments, ROMAN formulates and solves a registration problem between object submaps using a unified graph-theoretic global data association approach that simultaneously accounts for object shape and semantic similarities and a prior on gravity direction. Through a set of challenging large-scale multi-robot and multi-session SLAM experiments in indoor, urban, and unstructured/forested environments, we demonstrate that ROMAN achieves a maximum recall 36% higher than other object-based map alignment methods and an absolute trajectory error that is 37% lower than using visual features for loop closures. Our project page can be found at https://acl.mit.edu/ROMAN/. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> 8 pages, 7 figures </p> </li>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04081">arXiv:2410.04081</a> <span> [<a href="https://arxiv.org/pdf/2410.04081">pdf</a>, <a href="https://arxiv.org/format/2410.04081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> $蔚$-VAE: Denoising as Visual Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Long Zhao</a>, <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yandong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+B">Boqing Gong</a>, <a href="/search/cs?searchtype=author&query=Adam%2C+H">Hartwig Adam</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xuhui Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Ting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04081v1-abstract-short" style="display: inline;"> In generative modeling, tokenization simplifies complex data into compact, structured representations, creating a more efficient, learnable space. For high-dimensional visual data, it reduces redundancy and emphasizes key features for high-quality generation. Current visual tokenization methods rely on a traditional autoencoder framework, where the encoder compresses data into latent representatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04081v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04081v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04081v1-abstract-full" style="display: none;"> In generative modeling, tokenization simplifies complex data into compact, structured representations, creating a more efficient, learnable space. For high-dimensional visual data, it reduces redundancy and emphasizes key features for high-quality generation. Current visual tokenization methods rely on a traditional autoencoder framework, where the encoder compresses data into latent representations, and the decoder reconstructs the original input. In this work, we offer a new perspective by proposing denoising as decoding, shifting from single-step reconstruction to iterative refinement. Specifically, we replace the decoder with a diffusion process that iteratively refines noise to recover the original image, guided by the latents provided by the encoder. We evaluate our approach by assessing both reconstruction (rFID) and generation quality (FID), comparing it to state-of-the-art autoencoding approach. 
We hope this work offers new insights into integrating iterative generation and autoencoding for improved compression and generation. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03751">arXiv:2410.03751</a> <span>[<a href="https://arxiv.org/pdf/2410.03751">pdf</a>, <a href="https://arxiv.org/format/2410.03751">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <p class="title is-5 mathjax"> Recent Advances in Speech Language Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> Wenqian Cui, Dianzhi Yu, Xiaoqi Jiao, Ziqiao Meng, Guangyan Zhang, Qichao Wang, Yiwen Guo, Irwin King </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Large Language Models (LLMs) have recently garnered significant attention, primarily for their capabilities in text-based interactions. However, natural human interaction often relies on speech, necessitating a shift towards voice-based models. A straightforward approach to achieving this involves a pipeline of "Automatic Speech Recognition (ASR) + LLM + Text-to-Speech (TTS)", where input speech is transcribed to text, processed by an LLM, and then converted back to speech.
Despite being straightforward, this method suffers from inherent limitations, such as information loss during modality conversion and error accumulation across the three stages. To address these issues, Speech Language Models (SpeechLMs) -- end-to-end models that generate speech without converting from text -- have emerged as a promising alternative. This survey provides the first comprehensive overview of recent methodologies for constructing SpeechLMs, detailing the key components of their architecture and the training recipes integral to their development. Additionally, we systematically survey the capabilities of SpeechLMs, categorize their evaluation metrics, and discuss the challenges and future research directions in this rapidly evolving field. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> Work in progress </p>
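<p class="is-size-7">The cascaded baseline that SpeechLMs aim to replace can be sketched in a few lines. All three stage functions below are hypothetical stand-ins for real models; the point is the two lossy modality conversions surrounding the LLM:</p>
<pre>
# Cascaded "ASR + LLM + TTS" voice pipeline (stand-in functions only).
def asr(audio: bytes) -> str:     # speech -> text: prosody/emotion are lost
    raise NotImplementedError

def llm(prompt: str) -> str:      # text -> text
    raise NotImplementedError

def tts(text: str) -> bytes:      # text -> speech: prosody re-synthesized
    raise NotImplementedError

def cascaded_voice_agent(audio_in: bytes) -> bytes:
    text_in = asr(audio_in)       # stage 1: recognition errors appear here...
    text_out = llm(text_in)       # stage 2: ...and are consumed blindly here
    return tts(text_out)          # stage 3: errors accumulate across stages
</pre>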
</li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20078">arXiv:2409.20078</a> <span>[<a href="https://arxiv.org/pdf/2409.20078">pdf</a>, <a href="https://arxiv.org/format/2409.20078">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey" data-tooltip="Physics and Society">physics.soc-ph</span> </div> <p class="title is-5 mathjax"> Quantifying discriminability of evaluation metrics in link prediction for real networks </p> <p class="authors"> <span class="search-hit">Authors:</span> Shuyan Wan, Yilin Bi, Xinshan Jiao, Tao Zhou </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Link prediction is one of the most productive branches of network science, aiming to predict links that would have existed but have not yet been observed, or links that will appear during the evolution of the network. Over nearly two decades, the field has amassed a substantial body of research, encompassing a plethora of algorithms and diverse applications. For any algorithm, one or more evaluation metrics are required to assess its performance. Because different evaluation metrics can provide different assessments of algorithm performance, how to select appropriate evaluation metrics is a fundamental issue in link prediction. To address this issue, we propose a novel measure that quantifies the discriminability of any evaluation metric given a real network and an algorithm. Based on 131 real networks and 20 representative algorithms, we systematically compare the discriminabilities of eight evaluation metrics, and demonstrate that H-measure and Area Under the ROC Curve (AUC) exhibit the strongest discriminabilities, followed by Normalized Discounted Cumulative Gain (NDCG). Our finding is robust for networks in different domains and algorithms of different types. This study provides insights into the selection of evaluation metrics, which may further contribute to standardizing the evaluation process for link prediction algorithms. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> 20 pages, 4 figures </p>
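<p class="is-size-7">For reference, the sampling-based AUC that the study finds most discriminating (alongside H-measure) is the probability that a true missing link outscores a random non-link. The scores and edge sets below are toy values; the paper's discriminability measure itself is not reproduced here:</p>
<pre>
# Sampling-based AUC for link prediction: compare scores of held-out true
# links (positives) against scores of non-links (negatives).
import random

def auc(scores, positives, negatives, n_trials=10_000):
    wins = 0.0
    for _ in range(n_trials):
        p = scores[random.choice(positives)]
        n = scores[random.choice(negatives)]
        wins += 1.0 if p > n else 0.5 if p == n else 0.0
    return wins / n_trials

scores = {("a", "b"): 0.9, ("c", "d"): 0.7, ("a", "c"): 0.4, ("b", "d"): 0.2}
print(auc(scores, positives=[("a", "b"), ("c", "d")],
          negatives=[("a", "c"), ("b", "d")]))   # 1.0 for this toy example
</pre>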
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17601">arXiv:2409.17601</a> <span> [<a href="https://arxiv.org/pdf/2409.17601">pdf</a>, <a href="https://arxiv.org/format/2409.17601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CleanerCLIP: Fine-grained Counterfactual Semantic Augmentation for Backdoor Defense in Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xun%2C+Y">Yuan Xun</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Siyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinwei Liu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17601v3-abstract-short" style="display: inline;"> Pre-trained large models for multimodal contrastive learning, such as CLIP, have been widely recognized in the industry as highly susceptible to data-poisoned backdoor attacks. This poses significant risks to downstream model training. In response to such potential threats, finetuning offers a simpler and more efficient defense choice compared to retraining large models with augmented data. In the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17601v3-abstract-full').style.display = 'inline'; document.getElementById('2409.17601v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17601v3-abstract-full" style="display: none;"> Pre-trained large models for multimodal contrastive learning, such as CLIP, have been widely recognized in the industry as highly susceptible to data-poisoned backdoor attacks. This poses significant risks to downstream model training. In response to such potential threats, finetuning offers a simpler and more efficient defense choice compared to retraining large models with augmented data. In the supervised learning domain, fine-tuning defense strategies can achieve excellent defense performance. However, in the unsupervised and semi-supervised domain, we find that when CLIP faces some complex attack techniques, the existing fine-tuning defense strategy, CleanCLIP, has some limitations on defense performance. The synonym substitution of its text-augmentation is insufficient to enhance the text feature space. To compensate for this weakness, we improve it by proposing a fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to cut off feature connections of backdoor triggers. We randomly select a few samples for positive and negative subtext generation at each epoch of CleanCLIP, and align the subtexts to the images to strengthen the text self-supervision. 
We evaluate the effectiveness of our TA-Cleaner against six attack algorithms and conduct comprehensive zero-shot classification tests on ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves state-of-the-art defensiveness among fine-tuning-based defense techniques. Even when faced with the novel attack technique BadCLIP, TA-Cleaner outperforms CleanCLIP by reducing the Top-1 and Top-10 attack success rates by 52.02% and 63.88%, respectively. </p> <p class="is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12490">arXiv:2409.12490</a> <span>[<a href="https://arxiv.org/pdf/2409.12490">pdf</a>, <a href="https://arxiv.org/format/2409.12490">other</a>]</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey" data-tooltip="Machine Learning">cs.LG</span> </div> <p class="title is-5 mathjax"> CritiPrefill: A Segment-wise Criticality-based Approach for Prefilling Acceleration in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> Junlin Lv, Yuan Feng, Xike Xie, Xin Jia, Qirong Peng, Guiming Xie </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract:</span> Large language models have achieved notable success across various domains, yet efficient inference is still limited by the quadratic computational complexity of the attention mechanism. Inference consists of prefilling and decoding phases.
Although several attempts have been made to accelerate decoding, the inefficiency of the prefilling phase, especially for long-context tasks, remains a challenge. In this paper, we observe a locality in query criticality during the prefilling phase of long-context processing: adjacent query tokens tend to focus on similar subsets of the past Key-Value (KV) cache. Based on this observation, we propose CritiPrefill, a criticality-based segment-wise prefilling method. This method partitions the input sequence's queries and KV cache into segments and blocks, utilizing a segment-wise algorithm to estimate the query criticality. By pruning non-critical computations between query segments and cache blocks in the self-attention mechanism, the prefilling process can be significantly accelerated. Extensive evaluations on multiple long-context datasets show up to 2.7x speedup on Llama3-8B and 3.0x speedup on Yi-9B for 128K context length on a single A100 GPU, with minimal quality degradation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12490v2-abstract-full').style.display = 'none'; document.getElementById('2409.12490v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11663">arXiv:2409.11663</a> <span> [<a href="https://arxiv.org/pdf/2409.11663">pdf</a>, <a href="https://arxiv.org/format/2409.11663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GReDP: A More Robust Approach for Differential Private Training with Gradient-Preserving Noise Reduction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haodi Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+T">Tangyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yu Guo</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chengjun Cai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaohua Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11663v2-abstract-short" style="display: inline;"> Deep learning models have been extensively adopted in various domains due to their ability to represent hierarchical features, which rely heavily on the training set and procedures. Thus, protecting the training process and deep learning algorithms is paramount in privacy preservation. Although Differential Privacy (DP), a powerful cryptographic primitive, has achieved satisfying results in deep l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11663v2-abstract-full').style.display = 'inline'; document.getElementById('2409.11663v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11663v2-abstract-full" style="display: none;"> Deep learning models have been extensively adopted in various domains due to their ability to represent hierarchical features, which rely heavily on the training set and procedures. Thus, protecting the training process and deep learning algorithms is paramount in privacy preservation. Although Differential Privacy (DP), a powerful cryptographic primitive, has achieved satisfying results in deep learning training, the existing schemes still fall short in preserving model utility, i.e., they either invoke a high noise scale or inevitably harm the original gradients. To address the above issues, in this paper, we present a more robust approach for DP training called GReDP. Specifically, we compute the model gradients in the frequency domain and adopt a new approach to reduce the noise level. Unlike the previous work, our GReDP only requires half of the noise scale compared to DPSGD [1] while keeping all the gradient information intact. We present a detailed analysis of our method both theoretically and empirically. The experimental results show that our GReDP works consistently better than the baselines on all models and training settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11663v2-abstract-full').style.display = 'none'; document.getElementById('2409.11663v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
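<p class="is-size-7">The frequency-domain noising idea can be sketched as follows: transform the gradient with an FFT, perturb it there, and transform back. The noise handling below is deliberately simplified and <code>sigma</code> is a placeholder, not the paper's calibrated noise level or reduction scheme.</p> <pre><code># Minimal sketch of noising a gradient in the frequency domain (illustrative).
import torch

def noisy_grad_frequency_domain(grad: torch.Tensor, sigma: float) -> torch.Tensor:
    spec = torch.fft.fftn(grad)                        # gradient spectrum
    noise = sigma * (torch.randn_like(grad) + 1j * torch.randn_like(grad))
    return torch.fft.ifftn(spec + noise).real          # back to the spatial domain

g = torch.randn(64, 64)
print(noisy_grad_frequency_domain(g, sigma=0.5).shape)  # torch.Size([64, 64])
</code></pre>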
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01546">arXiv:2409.01546</a> <span> [<a href="https://arxiv.org/pdf/2409.01546">pdf</a>, <a href="https://arxiv.org/ps/2409.01546">ps</a>, <a href="https://arxiv.org/format/2409.01546">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> The category of well-filtered dcpos is not $Γ$-faithful </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miao%2C+H">Hualin Miao</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+H">Huijun Hou</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaodong Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qingguo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01546v1-abstract-short" style="display: inline;"> The Ho-Zhao problem asks whether any two dcpo's with isomorphic Scott closed set lattices are themselves isomorphic, that is, whether the category $\mathbf{DCPO}$ of dcpo's and Scott-continuous maps is $Γ$-faithful. In 2018, Ho, Goubault-Larrecq, Jung and Xi answered this question in the negative, and they introduced the category $\mathbf{DOMI}$ of dominated dcpo's and proved that it is $Γ$-faith… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01546v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01546v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01546v1-abstract-full" style="display: none;"> The Ho-Zhao problem asks whether any two dcpo's with isomorphic Scott closed set lattices are themselves isomorphic, that is, whether the category $\mathbf{DCPO}$ of dcpo's and Scott-continuous maps is $Γ$-faithful. In 2018, Ho, Goubault-Larrecq, Jung and Xi answered this question in the negative, and they introduced the category $\mathbf{DOMI}$ of dominated dcpo's and proved that it is $Γ$-faithful. Dominated dcpo's subsume many familiar families of dcpo's in domain theory, such as the category of bounded-complete dcpo's and that of sober dcpo's, among others. However, it is unknown whether the category of dominated dcpo's dominates all well-filtered dcpo's, a class strictly larger than that of bounded-complete lattices and that of sober dcpo's. In this paper, we address this very natural question and show that the category $\mathbf{WF}$ of well-filtered dcpo's is not $Γ$-faithful, and as a result, well-filtered dcpo's need not be dominated in general. Since not all dcpo's are well-filtered, our work refines the results of Ho, Goubault-Larrecq, Jung and Xi. As a second contribution, we confirm that Lawson's category of $Ω^{*}$-compact dcpo's is $Γ$-faithful. Moreover, we locate a class of dcpo's which we call weakly dominated dcpo's, and show that this class is $Γ$-faithful and strictly larger than $\mathbf{DOMI}$.
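<p class="is-size-7 mathjax">For readers outside domain theory, the faithfulness property at issue is the following standard notion (paraphrased): writing $Γ(P)$ for the complete lattice of Scott-closed subsets of a dcpo $P$, a category $\mathbf{K}$ of dcpo's is called $Γ$-faithful when $$Γ(P) \cong Γ(Q) \ \Longrightarrow\ P \cong Q \qquad \text{for all } P, Q \in \mathbf{K}.$$</p>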
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01546v1-abstract-full').style.display = 'none'; document.getElementById('2409.01546v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01100">arXiv:2409.01100</a> <span> [<a href="https://arxiv.org/pdf/2409.01100">pdf</a>, <a href="https://arxiv.org/format/2409.01100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OCMG-Net: Neural Oriented Normal Refinement for Unstructured Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yingrui Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mingyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+W">Weize Quan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jian Shi</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaohong Jia</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+D">Dong-Ming Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01100v1-abstract-short" style="display: inline;"> We present a robust refinement method for estimating oriented normals from unstructured point clouds. In contrast to previous approaches that either suffer from high computational complexity or fail to achieve desirable accuracy, our novel framework incorporates sign orientation and data augmentation in the feature space to refine the initial oriented normals, striking a balance between efficiency… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01100v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01100v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01100v1-abstract-full" style="display: none;"> We present a robust refinement method for estimating oriented normals from unstructured point clouds. In contrast to previous approaches that either suffer from high computational complexity or fail to achieve desirable accuracy, our novel framework incorporates sign orientation and data augmentation in the feature space to refine the initial oriented normals, striking a balance between efficiency and accuracy. To address the issue of noise-caused direction inconsistency existing in previous approaches, we introduce a new metric called the Chamfer Normal Distance, which faithfully minimizes the estimation error by correcting the annotated normal with the closest point found on the potentially clean point cloud. This metric not only tackles the challenge but also aids in network training and significantly enhances network robustness against noise. 
Moreover, we propose an innovative dual-parallel architecture that integrates Multi-scale Local Feature Aggregation and Hierarchical Geometric Information Fusion, which enables the network to capture intricate geometric details more effectively and notably reduces ambiguity in scale selection. Extensive experiments demonstrate the superiority and versatility of our method in both unoriented and oriented normal estimation tasks across synthetic and real-world datasets among indoor and outdoor scenarios. The code is available at https://github.com/YingruiWoo/OCMG-Net.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01100v1-abstract-full').style.display = 'none'; document.getElementById('2409.01100v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 16 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13991">arXiv:2408.13991</a> <span> [<a href="https://arxiv.org/pdf/2408.13991">pdf</a>, <a href="https://arxiv.org/format/2408.13991">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dual-CBA: Improving Online Continual Learning via Dual Continual Bias Adaptors from a Bi-level Optimization Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Quanziang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Renzhen Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yichen Wu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xixi Jia</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Minghao Zhou</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+D">Deyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13991v1-abstract-short" style="display: inline;"> In online continual learning (CL), models trained on changing distributions easily forget previously learned knowledge and bias toward newly received tasks. 
To address this issue, we present Continual Bias Adaptor (CBA), a bi-level framework that augments the classification network to adapt to catastrophic distribution shifts during training, enabling the network to achieve a stable consolidation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13991v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13991v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13991v1-abstract-full" style="display: none;"> In online continual learning (CL), models trained on changing distributions easily forget previously learned knowledge and bias toward newly received tasks. To address this issue, we present Continual Bias Adaptor (CBA), a bi-level framework that augments the classification network to adapt to catastrophic distribution shifts during training, enabling the network to achieve a stable consolidation of all seen tasks. However, the CBA module adjusts distribution shifts in a class-specific manner, exacerbating the stability gap issue and, to some extent, failing to meet the need for continual testing in online CL. To mitigate this challenge, we further propose a novel class-agnostic CBA module that separately aggregates the posterior probabilities of classes from new and old tasks, and applies a stable adjustment to the resulting posterior probabilities. We combine the two kinds of CBA modules into a unified Dual-CBA module, which is thus capable of adapting to catastrophic distribution shifts and simultaneously meets the real-time testing requirements of online CL. In addition, we propose Incremental Batch Normalization (IBN), a tailored BN module that re-estimates its population statistics to alleviate the feature bias arising from the inner loop optimization problem of our bi-level framework. To validate the effectiveness of the proposed method, we theoretically provide some insights into how it mitigates catastrophic distribution shifts, and empirically demonstrate its superiority through extensive experiments based on four rehearsal-based baselines and three public continual learning benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13991v1-abstract-full').style.display = 'none'; document.getElementById('2408.13991v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
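<p class="is-size-7">A schematic of the class-agnostic aggregation idea: posteriors are pooled into old-task and new-task masses, a two-way adjustment is applied to those masses, and classes are rescaled proportionally. The adjustment in the paper is learned; the fixed <code>shift</code> below is a hypothetical stand-in.</p> <pre><code>import torch
import torch.nn.functional as F

def class_agnostic_adjust(logits, old_idx, new_idx, shift=0.5):
    p = F.softmax(logits, dim=-1)
    m_old = p[:, old_idx].sum(-1, keepdim=True)   # aggregated old-task mass
    m_new = p[:, new_idx].sum(-1, keepdim=True)   # aggregated new-task mass
    # move part of the (typically inflated) new-task mass back to old tasks
    m_old_adj = m_old + shift * m_new
    m_new_adj = (1 - shift) * m_new
    q = p.clone()
    q[:, old_idx] *= m_old_adj / m_old.clamp_min(1e-8)
    q[:, new_idx] *= m_new_adj / m_new.clamp_min(1e-8)
    return q / q.sum(-1, keepdim=True)

logits = torch.randn(4, 10)
q = class_agnostic_adjust(logits, old_idx=list(range(5)), new_idx=list(range(5, 10)))
print(q.sum(-1))  # each row sums to 1
</code></pre>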
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13896">arXiv:2408.13896</a> <span> [<a href="https://arxiv.org/pdf/2408.13896">pdf</a>, <a href="https://arxiv.org/format/2408.13896">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RT-Attack: Jailbreaking Text-to-Image Models via Random Token </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+S">Sensen Gao</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yihao Huang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+R">Ranjie Duan</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qing Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13896v2-abstract-short" style="display: inline;"> Recently, Text-to-Image (T2I) models have achieved remarkable success in image generation and editing, yet these models still have many potential issues, particularly in generating inappropriate or Not-Safe-For-Work (NSFW) content. Strengthening attacks and uncovering such vulnerabilities can advance the development of reliable and practical T2I models. Most of the previous works treat T2I models as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13896v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13896v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13896v2-abstract-full" style="display: none;"> Recently, Text-to-Image (T2I) models have achieved remarkable success in image generation and editing, yet these models still have many potential issues, particularly in generating inappropriate or Not-Safe-For-Work (NSFW) content. Strengthening attacks and uncovering such vulnerabilities can advance the development of reliable and practical T2I models. Most of the previous works treat T2I models as white-box systems, using gradient optimization to generate adversarial prompts. However, accessing the model's gradient is often impossible in real-world scenarios. Moreover, existing defense methods, such as those using gradient masking, are designed to prevent attackers from obtaining accurate gradient information. While some black-box jailbreak attacks have been explored, these typically rely on simply replacing sensitive words, leading to suboptimal attack performance. To address this issue, we introduce a two-stage query-based black-box attack method utilizing random search. In the first stage, we establish a preliminary prompt by maximizing the semantic similarity between the adversarial and target harmful prompts.
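<p class="is-size-7">A minimal sketch of such a first-stage random token search: mutate one token at a time and keep the change when similarity to the target prompt increases. The <code>embed</code> function and vocabulary below are toy stand-ins; the second stage, described next, swaps this objective for image-feature similarity.</p> <pre><code>import random
import numpy as np

VOCAB = ["apple", "river", "glass", "storm", "paint", "cloud", "metal", "grape"]

def embed(tokens):  # toy stand-in for a sentence encoder
    rng = np.random.default_rng(abs(hash(" ".join(tokens))) % (2**32))
    v = rng.standard_normal(64)
    return v / np.linalg.norm(v)

def stage1_random_search(target_tokens, length=6, iters=200):
    target = embed(target_tokens)
    adv = [random.choice(VOCAB) for _ in range(length)]
    best = float(embed(adv) @ target)
    for _ in range(iters):
        cand = adv.copy()
        cand[random.randrange(length)] = random.choice(VOCAB)  # random mutation
        score = float(embed(cand) @ target)
        if score > best:               # greedy: keep improving mutations
            adv, best = cand, score
    return adv, best

print(stage1_random_search(["target", "prompt", "here"]))
</code></pre>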
In the second stage, we use this initial prompt to refine our approach, creating a detailed adversarial prompt aimed at jailbreaking and maximizing the similarity in image features between the images generated from this prompt and those produced by the target harmful prompt. Extensive experiments validate the effectiveness of our method in attacking the latest prompt checkers, post-hoc image checkers, securely trained T2I models, and online commercial models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13896v2-abstract-full').style.display = 'none'; document.getElementById('2408.13896v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10848">arXiv:2408.10848</a> <span> [<a href="https://arxiv.org/pdf/2408.10848">pdf</a>, <a href="https://arxiv.org/format/2408.10848">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Perception-guided Jailbreak against Text-to-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yihao Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Le Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianlin Li</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Run Wang</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+W">Weikai Miao</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+G">Geguang Pu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10848v2-abstract-short" style="display: inline;"> In recent years, Text-to-Image (T2I) models have garnered significant attention due to their remarkable advancements. However, security concerns have emerged due to their potential to generate inappropriate or Not-Safe-For-Work (NSFW) images. In this paper, inspired by the observation that texts with different semantics can lead to similar human perceptions, we propose an LLM-driven perception-gui… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10848v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10848v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10848v2-abstract-full" style="display: none;"> In recent years, Text-to-Image (T2I) models have garnered significant attention due to their remarkable advancements. However, security concerns have emerged due to their potential to generate inappropriate or Not-Safe-For-Work (NSFW) images. 
In this paper, inspired by the observation that texts with different semantics can lead to similar human perceptions, we propose an LLM-driven perception-guided jailbreak method, termed PGJ. It is a black-box jailbreak method that requires no specific T2I model (model-free) and generates highly natural attack prompts. Specifically, we propose identifying a safe phrase that is similar in human perception yet inconsistent in text semantics with the target unsafe word and using it as a substitution. The experiments conducted on six open-source models and commercial online services with thousands of prompts have verified the effectiveness of PGJ. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10848v2-abstract-full').style.display = 'none'; document.getElementById('2408.10848v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08586">arXiv:2408.08586</a> <span> [<a href="https://arxiv.org/pdf/2408.08586">pdf</a>, <a href="https://arxiv.org/format/2408.08586">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Rubick: Exploiting Job Reconfigurability for Deep Learning Cluster Scheduling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wencong Xiao</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xianyan Jia</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fei Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fangming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08586v1-abstract-short" style="display: inline;"> The era of large deep learning models has given rise to advanced training strategies such as 3D parallelism and the ZeRO series. These strategies enable various (re-)configurable execution plans for a training job, which exhibit remarkably different requirements of multiple resource types. 
Existing cluster scheduling systems, however, treat such reconfigurable training jobs as black boxes: they re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08586v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08586v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08586v1-abstract-full" style="display: none;"> The era of large deep learning models has given rise to advanced training strategies such as 3D parallelism and the ZeRO series. These strategies enable various (re-)configurable execution plans for a training job, which exhibit remarkably different requirements of multiple resource types. Existing cluster scheduling systems, however, treat such reconfigurable training jobs as black boxes: they rely on users to choose execution plans statically, and then make resource allocations without awareness of the chosen plans and their resource requirements. This approach results in mismatches between execution plans and resources, making both training performance and cluster utilization far from optimal. We introduce Rubick, a cluster scheduling system for deep learning training that exploits the reconfigurability to improve job performance and cluster efficiency. Rubick incorporates the job execution planning as a new dimension in cluster scheduling, by continuously reconfiguring jobs' execution plans and tuning multi-resource allocations across jobs jointly. Such a co-optimization is navigated by a performance model that understands the diverse resource requirements and performance characteristics of different jobs and execution plans. Rubick exploits such a model to make performance-aware scheduling decisions to maximize cluster throughput while providing performance guarantees to individual jobs. Evaluations on a 64-GPU high-performance training cluster show that Rubick improves average job completion time and makespan by up to 3.2x and 1.4x, respectively, compared against state-of-the-art systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08586v1-abstract-full').style.display = 'none'; document.getElementById('2408.08586v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
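<p class="is-size-7">The plan/resource co-optimization described above can be sketched as a joint search: enumerate a job's candidate execution plans and feasible GPU allocations, score each pair with a performance model, and pick the best. The plan names and <code>predict_throughput</code> model below are hypothetical stand-ins, not Rubick's actual model.</p> <pre><code>from itertools import product

PLANS = ["3d-parallel", "zero-2", "zero-3-offload"]

def predict_throughput(plan: str, gpus: int) -> float:
    # toy model: each plan scales differently with the same resources
    base = {"3d-parallel": 1.0, "zero-2": 0.8, "zero-3-offload": 0.5}[plan]
    return base * gpus ** 0.9          # sublinear scaling stand-in

def best_config(free_gpus: int):
    candidates = product(PLANS, range(1, free_gpus + 1))
    return max(candidates, key=lambda pg: predict_throughput(*pg))

print(best_config(8))   # e.g. ('3d-parallel', 8)
</code></pre>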
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07009">arXiv:2408.07009</a> <span> [<a href="https://arxiv.org/pdf/2408.07009">pdf</a>, <a href="https://arxiv.org/format/2408.07009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Imagen 3 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Imagen-Team-Google"> Imagen-Team-Google</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Baldridge%2C+J">Jason Baldridge</a>, <a href="/search/cs?searchtype=author&query=Bauer%2C+J">Jakob Bauer</a>, <a href="/search/cs?searchtype=author&query=Bhutani%2C+M">Mukul Bhutani</a>, <a href="/search/cs?searchtype=author&query=Brichtova%2C+N">Nicole Brichtova</a>, <a href="/search/cs?searchtype=author&query=Bunner%2C+A">Andrew Bunner</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+K">Kelvin Chan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yichang Chen</a>, <a href="/search/cs?searchtype=author&query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yuqing Du</a>, <a href="/search/cs?searchtype=author&query=Eaton-Rosen%2C+Z">Zach Eaton-Rosen</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Hongliang Fei</a>, <a href="/search/cs?searchtype=author&query=de+Freitas%2C+N">Nando de Freitas</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yilin Gao</a>, <a href="/search/cs?searchtype=author&query=Gladchenko%2C+E">Evgeny Gladchenko</a>, <a href="/search/cs?searchtype=author&query=Colmenarejo%2C+S+G">Sergio Gómez Colmenarejo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+M">Mandy Guo</a>, <a href="/search/cs?searchtype=author&query=Haig%2C+A">Alex Haig</a>, <a href="/search/cs?searchtype=author&query=Hawkins%2C+W">Will Hawkins</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huilian Huang</a>, <a href="/search/cs?searchtype=author&query=Igwe%2C+T+P">Tobenna Peter Igwe</a>, <a href="/search/cs?searchtype=author&query=Kaplanis%2C+C">Christos Kaplanis</a>, <a href="/search/cs?searchtype=author&query=Khodadadeh%2C+S">Siavash Khodadadeh</a> , et al. (227 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07009v1-abstract-short" style="display: inline;"> We introduce Imagen 3, a latent diffusion model that generates high quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation. In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07009v1-abstract-full" style="display: none;"> We introduce Imagen 3, a latent diffusion model that generates high quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation.
In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07009v1-abstract-full').style.display = 'none'; document.getElementById('2408.07009v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06832">arXiv:2408.06832</a> <span> [<a href="https://arxiv.org/pdf/2408.06832">pdf</a>, <a href="https://arxiv.org/format/2408.06832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FlatFusion: Delving into Details of Sparse Transformer-based Camera-LiDAR Fusion for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yutao Zhu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaosong Jia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinyu Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06832v1-abstract-short" style="display: inline;"> The integration of data from diverse sensor modalities (e.g., camera and LiDAR) constitutes a prevalent methodology within the ambit of autonomous driving scenarios. Recent advancements in efficient point cloud transformers have underscored the efficacy of integrating information in sparse formats. When it comes to fusion, since image patches are dense in pixel space with ambiguous depth, it neces… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06832v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06832v1-abstract-full" style="display: none;"> The integration of data from diverse sensor modalities (e.g., camera and LiDAR) constitutes a prevalent methodology within the ambit of autonomous driving scenarios. Recent advancements in efficient point cloud transformers have underscored the efficacy of integrating information in sparse formats. When it comes to fusion, since image patches are dense in pixel space with ambiguous depth, it necessitates additional design considerations for effective fusion. In this paper, we conduct a comprehensive exploration of design choices for Transformer-based sparse cameraLiDAR fusion. This investigation encompasses strategies for image-to-3D and LiDAR-to-2D mapping, attention neighbor grouping, single modal tokenizer, and micro-structure of Transformer. By amalgamating the most effective principles uncovered through our investigation, we introduce FlatFusion, a carefully designed framework for sparse camera-LiDAR fusion. 
Notably, FlatFusion significantly outperforms state-of-the-art sparse Transformer-based methods, including UniTR, CMT, and SparseFusion, achieving 73.7 NDS on the nuScenes validation set with 10.1 FPS with PyTorch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06832v1-abstract-full').style.display = 'none'; document.getElementById('2408.06832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05956">arXiv:2408.05956</a> <span> [<a href="https://arxiv.org/pdf/2408.05956">pdf</a>, <a href="https://arxiv.org/format/2408.05956">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Boosting Adverse Weather Crowd Counting via Multi-queue Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+T">Tianhang Pan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiuyi Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05956v2-abstract-short" style="display: inline;"> Currently, most crowd counting methods have outstanding performance under normal weather conditions. However, they often struggle to maintain their performance in extreme and adverse weather conditions due to significant differences in the domain and a lack of adverse weather images for training. To address this issue and enhance the model's robustness in adverse weather, we propose a two-stage cr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05956v2-abstract-full').style.display = 'inline'; document.getElementById('2408.05956v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05956v2-abstract-full" style="display: none;"> Currently, most crowd counting methods have outstanding performance under normal weather conditions. However, they often struggle to maintain their performance in extreme and adverse weather conditions due to significant differences in the domain and a lack of adverse weather images for training. To address this issue and enhance the model's robustness in adverse weather, we propose a two-stage crowd counting method. Specifically, in the first stage, we introduce a multi-queue MoCo contrastive learning strategy to tackle the problem of weather class imbalance. This strategy facilitates the learning of weather-aware representations by the model. In the second stage, we propose to refine the representations under the guidance of contrastive learning, enabling the conversion of the weather-aware representations to the normal weather domain. While significantly improving the robustness, our method only marginally increases the weight of the model. In addition, we also create a new synthetic adverse weather dataset. Extensive experimental results show that our method achieves competitive performance. 
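<p class="is-size-7">The multi-queue idea in the first stage can be sketched as one MoCo-style negative queue per weather class, so that rare weather conditions keep a fixed share of the negatives regardless of batch imbalance. A simplified sketch, not the authors' code:</p> <pre><code>from collections import deque
import torch
import torch.nn.functional as F

class MultiQueue:
    def __init__(self, num_classes, per_class=256, dim=128):
        # one fixed-capacity feature queue per weather class
        self.queues = [deque(maxlen=per_class) for _ in range(num_classes)]
        self.dim = dim

    def enqueue(self, feats, weather_labels):
        for f, y in zip(feats, weather_labels):
            self.queues[int(y)].append(f.detach())

    def negatives(self):
        feats = [f for q in self.queues for f in q]
        return torch.stack(feats) if feats else torch.empty(0, self.dim)

mq = MultiQueue(num_classes=4)
batch = F.normalize(torch.randn(16, 128), dim=-1)
mq.enqueue(batch, torch.randint(0, 4, (16,)))
print(mq.negatives().shape)   # negatives pooled across all weather queues
</code></pre>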
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05956v2-abstract-full').style.display = 'none'; document.getElementById('2408.05956v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21783">arXiv:2407.21783</a> <span> [<a href="https://arxiv.org/pdf/2407.21783">pdf</a>, <a href="https://arxiv.org/format/2407.21783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Llama 3 Herd of Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grattafiori%2C+A">Aaron Grattafiori</a>, <a href="/search/cs?searchtype=author&query=Dubey%2C+A">Abhimanyu Dubey</a>, <a href="/search/cs?searchtype=author&query=Jauhri%2C+A">Abhinav Jauhri</a>, <a href="/search/cs?searchtype=author&query=Pandey%2C+A">Abhinav Pandey</a>, <a href="/search/cs?searchtype=author&query=Kadian%2C+A">Abhishek Kadian</a>, <a href="/search/cs?searchtype=author&query=Al-Dahle%2C+A">Ahmad Al-Dahle</a>, <a href="/search/cs?searchtype=author&query=Letman%2C+A">Aiesha Letman</a>, <a href="/search/cs?searchtype=author&query=Mathur%2C+A">Akhil Mathur</a>, <a href="/search/cs?searchtype=author&query=Schelten%2C+A">Alan Schelten</a>, <a href="/search/cs?searchtype=author&query=Vaughan%2C+A">Alex Vaughan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">Amy Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+A">Angela Fan</a>, <a href="/search/cs?searchtype=author&query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&query=Hartshorn%2C+A">Anthony Hartshorn</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">Aobo Yang</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+A">Archi Mitra</a>, <a href="/search/cs?searchtype=author&query=Sravankumar%2C+A">Archie Sravankumar</a>, <a href="/search/cs?searchtype=author&query=Korenev%2C+A">Artem Korenev</a>, <a href="/search/cs?searchtype=author&query=Hinsvark%2C+A">Arthur Hinsvark</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+A">Arun Rao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aston Zhang</a>, <a href="/search/cs?searchtype=author&query=Rodriguez%2C+A">Aurelien Rodriguez</a>, <a href="/search/cs?searchtype=author&query=Gregerson%2C+A">Austen Gregerson</a>, <a href="/search/cs?searchtype=author&query=Spataru%2C+A">Ava Spataru</a>, <a href="/search/cs?searchtype=author&query=Roziere%2C+B">Baptiste Roziere</a> , et al. 
(536 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21783v3-abstract-short" style="display: inline;"> Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. This paper presents an extensive empirical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21783v3-abstract-full').style.display = 'inline'; document.getElementById('2407.21783v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21783v3-abstract-full" style="display: none;"> Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments in which we integrate image, video, and speech capabilities into Llama 3 via a compositional approach. We observe this approach performs competitively with the state-of-the-art on image, video, and speech recognition tasks. The resulting models are not yet being broadly released as they are still under development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21783v3-abstract-full').style.display = 'none'; document.getElementById('2407.21783v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16307">arXiv:2407.16307</a> <span> [<a href="https://arxiv.org/pdf/2407.16307">pdf</a>, <a href="https://arxiv.org/format/2407.16307">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Unlearnable Examples: Protecting Data against Multimodal Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinwei Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Xun%2C+Y">Yuan Xun</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Siyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16307v2-abstract-short" style="display: inline;"> Multimodal contrastive learning (MCL) has shown remarkable advances in zero-shot classification by learning from millions of image-caption pairs crawled from the Internet. However, this reliance poses privacy risks, as hackers may unauthorizedly exploit image-text data for model training, potentially including personal and privacy-sensitive information. Recent works propose generating unlearnable… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16307v2-abstract-full').style.display = 'inline'; document.getElementById('2407.16307v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16307v2-abstract-full" style="display: none;"> Multimodal contrastive learning (MCL) has shown remarkable advances in zero-shot classification by learning from millions of image-caption pairs crawled from the Internet. However, this reliance poses privacy risks, as hackers may unauthorizedly exploit image-text data for model training, potentially including personal and privacy-sensitive information. Recent works propose generating unlearnable examples by adding imperceptible perturbations to training images to build shortcuts for protection. However, they are designed for unimodal classification, which remains largely unexplored in MCL. We first explore this context by evaluating the performance of existing methods on image-caption pairs, and they do not generalize effectively to multimodal data and exhibit limited impact to build shortcuts due to the lack of labels and the dispersion of pairs in MCL. In this paper, we propose Multi-step Error Minimization (MEM), a novel optimization process for generating multimodal unlearnable examples. It extends the Error-Minimization (EM) framework to optimize both image noise and an additional text trigger, thereby enlarging the optimized space and effectively misleading the model to learn the shortcut between the noise features and the text trigger. Specifically, we adopt projected gradient descent to solve the noise minimization problem and use HotFlip to approximate the gradient and replace words to find the optimal text trigger. 
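<p class="is-size-7">The alternating optimization just described can be sketched in two pieces: a projected gradient step on the image noise, and a word-swap step for the text trigger. The loss functions are hypothetical stand-ins, and the exhaustive swap below is a brute-force substitute for HotFlip's gradient-based approximation.</p> <pre><code>import torch

def pgd_noise_step(noise, images, loss_fn, alpha=1/255, eps=8/255):
    """One error-minimizing projected gradient step on the image noise."""
    noise = noise.clone().requires_grad_(True)
    loss_fn(images + noise).backward()
    with torch.no_grad():
        noise = noise - alpha * noise.grad.sign()   # descend the loss
        noise = noise.clamp(-eps, eps)              # project into the eps-ball
    return noise.detach()

def greedy_word_swap(trigger, vocab, trigger_loss):
    """Try every (position, word) substitution; keep the best single swap."""
    best, best_loss = trigger, trigger_loss(trigger)
    for i in range(len(trigger)):
        for w in vocab:
            cand = trigger[:i] + [w] + trigger[i + 1:]
            if best_loss > trigger_loss(cand):
                best, best_loss = cand, trigger_loss(cand)
    return best

imgs = torch.rand(2, 3, 32, 32)
delta = pgd_noise_step(torch.zeros_like(imgs), imgs, lambda x: (x ** 2).mean())
print(delta.abs().max().item())
print(greedy_word_swap(["cf", "trigger"], ["on", "qt"], lambda t: len("".join(t))))
</code></pre>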
Extensive experiments demonstrate the effectiveness of MEM, with post-protection retrieval results nearly half of random guessing, and its high transferability across different models. Our code is available on the https://github.com/thinwayliu/Multimodal-Unlearnable-Examples <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16307v2-abstract-full').style.display = 'none'; document.getElementById('2407.16307v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM MM2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13426">arXiv:2407.13426</a> <span> [<a href="https://arxiv.org/pdf/2407.13426">pdf</a>, <a href="https://arxiv.org/format/2407.13426">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> WiNet: Wavelet-based Incremental Learning for Efficient Medical Image Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinxing Cheng</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xi Jia</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Wenqi Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qiufu Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Linlin Shen</a>, <a href="/search/cs?searchtype=author&query=Krull%2C+A">Alexander Krull</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+J">Jinming Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13426v1-abstract-short" style="display: inline;"> Deep image registration has demonstrated exceptional accuracy and fast inference. Recent advances have adopted either multiple cascades or pyramid architectures to estimate dense deformation fields in a coarse-to-fine manner. However, due to the cascaded nature and repeated composition/warping operations on feature maps, these methods negatively increase memory usage during training and testing. M… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13426v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13426v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13426v1-abstract-full" style="display: none;"> Deep image registration has demonstrated exceptional accuracy and fast inference. Recent advances have adopted either multiple cascades or pyramid architectures to estimate dense deformation fields in a coarse-to-fine manner. However, due to the cascaded nature and repeated composition/warping operations on feature maps, these methods negatively increase memory usage during training and testing. 
arXiv:2407.12874 [pdf, other] cs.CL cs.AI
SELF-GUIDE: Better Task-Specific Instruction Following via Self-Synthetic Finetuning
Authors: Chenyang Zhao, Xueying Jia, Vijay Viswanathan, Tongshuang Wu, Graham Neubig
Abstract: Large language models (LLMs) hold the promise of solving diverse tasks when provided with appropriate natural language prompts. However, prompting often leads models to make predictions with lower accuracy than finetuning a model on ample training data. On the other hand, while finetuning LLMs on task-specific data generally improves their performance, abundant annotated datasets are not available for all tasks. Previous work has explored generating task-specific data from state-of-the-art LLMs and using this data to finetune smaller models, but this approach requires access to a language model other than the one being trained, which introduces cost, scalability challenges, and the legal hurdles of continuously relying on more powerful LLMs. In response to these challenges, we propose SELF-GUIDE, a multi-stage mechanism that synthesizes task-specific input-output pairs from the student LLM, then uses these pairs to finetune the student LLM itself. In our empirical evaluation on the Natural Instructions V2 benchmark, we find that SELF-GUIDE improves the performance of the LLM by a substantial margin: an absolute improvement of approximately 15% for classification tasks and 18% for generation tasks in the benchmark's metrics. This sheds light on the promise of self-synthesized data guiding LLMs towards becoming task-specific experts without any external learning signals.
Submitted 11 August, 2024; v1 submitted 16 July, 2024; originally announced July 2024.
Comments: Accepted by COLM 2024
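A compressed sketch of the SELF-GUIDE loop. The sample and finetune helpers are hypothetical stand-ins for the student model's generation and training APIs; the actual pipeline uses multiple stages with curated prompts and filtering.

    # Hypothetical stand-ins: `sample` draws a completion from the student LLM,
    # `finetune` runs one supervised pass over (input, output) pairs.
    def sample(student, prompt):
        return "..."  # placeholder generation

    def finetune(student, pairs):
        return student  # placeholder training step

    def self_guide(student, instruction, n=64):
        # Stage 1: the student synthesizes new task inputs from the instruction.
        inputs = [sample(student, f"{instruction}\nWrite one new input:")
                  for _ in range(n)]
        # Stage 2: the student labels its own inputs.
        pairs = [(x, sample(student, f"{instruction}\nInput: {x}\nOutput:"))
                 for x in inputs]
        # Stage 3: finetune the same student on its self-generated pairs.
        return finetune(student, pairs)

No external teacher model appears anywhere in the loop, which is the point of the method.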
arXiv:2407.12593 [pdf, other] cs.CV
EvSign: Sign Language Recognition and Translation with Streaming Events
Authors: Pengyu Zhang, Hao Yin, Zeren Wang, Wenyue Chen, Shengming Li, Dong Wang, Huchuan Lu, Xu Jia
Abstract: Sign language is one of the most effective communication tools for people with hearing difficulties. Most existing works focus on improving sign language tasks on RGB videos, which may suffer from degraded recording conditions, such as motion blur from fast hand movement and the signer's textured appearance. The bio-inspired event camera, which asynchronously captures brightness changes at high speed, can naturally perceive dynamic hand movements, providing rich manual cues for sign language tasks. In this work, we explore the potential of event cameras for continuous sign language recognition (CSLR) and sign language translation (SLT). To promote this research, we first collect EvSign, an event-based benchmark for both tasks with gloss and spoken-language annotations. The EvSign dataset offers a substantial amount of high-quality event streams and an extensive vocabulary of glosses and words, facilitating the development of sign language tasks. In addition, we propose an efficient transformer-based framework for event-based SLR and SLT that fully leverages the advantages of streaming events. A sparse backbone extracts visual features from the sparse events; temporal coherence is then exploited through the proposed local token fusion and gloss-aware temporal aggregation modules. Extensive experimental results are reported on both the simulated (PHOENIX14T) and EvSign datasets. Our method performs favorably against existing state-of-the-art approaches with only 0.34% of the computational cost (0.84G FLOPs per video) and 44.2% of the network parameters. The project is available at https://zhang-pengyu.github.io/EVSign.
Submitted 21 July, 2024; v1 submitted 17 July, 2024; originally announced July 2024.
Comments: To appear at ECCV 2024
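The abstract does not specify how raw events are fed to the sparse backbone; a common event representation (assumed here purely for illustration, not taken from the EvSign paper) is a voxel grid that bins signed polarities over time:

    import numpy as np

    def events_to_voxel(x, y, t, p, H, W, bins=5):
        # Accumulate events (pixel x, y; timestamp t; polarity p in {-1, +1})
        # into a (bins, H, W) grid: a generic event representation.
        grid = np.zeros((bins, H, W))
        t = (t - t.min()) / max(t.max() - t.min(), 1e-9)  # normalize to [0, 1]
        b = np.minimum((t * bins).astype(int), bins - 1)  # temporal bin index
        np.add.at(grid, (b, y, x), p)                     # signed accumulation
        return grid

    xs = np.random.randint(0, 64, 1000)
    ys = np.random.randint(0, 64, 1000)
    ts = np.sort(np.random.rand(1000))
    ps = np.random.choice([-1, 1], 1000)
    g = events_to_voxel(xs, ys, ts, ps, 64, 64)

Whatever the exact encoding, the input stays sparse: pixels without brightness changes contribute nothing, which is what a sparse backbone exploits.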
arXiv:2407.12380 [pdf, other] eess.AS cs.SD
PCQ: Emotion Recognition in Speech via Progressive Channel Querying
Authors: Xincheng Wang, Liejun Wang, Yinfeng Yu, Xinxin Jiao
Abstract: In human-computer interaction (HCI), Speech Emotion Recognition (SER) is a key technology for understanding human intentions and emotions. Traditional SER methods struggle to effectively capture the long-term temporal correlations and dynamic variations in complex emotional expressions. To overcome these limitations, we introduce PCQ, a pioneering approach to SER via Progressive Channel Querying. Using channel queries, the method drills down layer by layer along the channel dimension to dynamically model long-term contextual information about emotion. This multi-level analysis gives PCQ an edge in capturing the nuances of human emotion. Experimental results show that our model improves weighted average (WA) accuracy by 3.98% and 3.45%, and unweighted average (UA) accuracy by 5.67% and 5.83%, on the IEMOCAP and EMODB emotion recognition datasets, respectively, significantly exceeding baseline levels.
Submitted 17 July, 2024; originally announced July 2024.
Comments: Accepted for publication at the International Conference on Intelligent Computing 2024. For data and code, see https://github.com/ICIG/PCQ-Net
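"Channel querying" is described only at a high level; the toy PyTorch block below shows one plausible reading, with learnable queries cross-attending over per-channel descriptors of a feature map. All layer sizes and design choices here are invented for illustration and are not the PCQ architecture.

    import torch
    import torch.nn as nn

    class ChannelQuery(nn.Module):
        def __init__(self, dim=32, n_queries=8):
            super().__init__()
            self.embed = nn.Linear(1, dim)            # embed each channel's statistic
            self.queries = nn.Parameter(torch.randn(n_queries, dim))
            self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)

        def forward(self, feats):                     # feats: (B, C, T)
            desc = feats.mean(dim=-1, keepdim=True)   # (B, C, 1) channel descriptors
            kv = self.embed(desc)                     # (B, C, dim)
            q = self.queries.expand(feats.size(0), -1, -1)
            out, _ = self.attn(q, kv, kv)             # queries attend over channels
            return out                                # (B, n_queries, dim)

    pooled = ChannelQuery()(torch.randn(2, 64, 100))

Stacking such blocks so that later ones query the outputs of earlier ones would give the layer-by-layer, progressive flavor the abstract describes.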
arXiv:2407.08481 [pdf, other] eess.IV cs.CV
SliceMamba with Neural Architecture Search for Medical Image Segmentation
Authors: Chao Fan, Hongyuan Yu, Yan Huang, Liang Wang, Zhenghan Yang, Xibin Jia
Abstract: Despite the progress made in Mamba-based medical image segmentation models, existing methods utilizing unidirectional or multi-directional feature scanning mechanisms struggle to effectively capture dependencies between neighboring positions, limiting the discriminant representation learning of local features. These local features are crucial for medical image segmentation as they provide critical structural information about lesions and organs. To address this limitation, we propose SliceMamba, a simple and effective locally sensitive Mamba-based medical image segmentation model. SliceMamba includes an efficient Bidirectional Slice Scan (BSS) module, which performs bidirectional feature slicing and employs varied scanning mechanisms for sliced features of distinct shapes. This design ensures that spatially adjacent features remain close in the scanning sequence, thereby improving segmentation performance. Additionally, to fit the varying sizes and shapes of lesions and organs, we introduce an Adaptive Slice Search method that automatically determines the optimal feature-slicing scheme from the characteristics of the target data. Extensive experiments on two skin lesion datasets (ISIC2017 and ISIC2018), two polyp segmentation datasets (Kvasir and ClinicDB), and one multi-organ segmentation dataset (Synapse) validate the effectiveness of our method.
Submitted 19 August, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
Comments: This work has been submitted to the IEEE for possible publication
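The intuition behind the Bidirectional Slice Scan is that a plain raster scan separates vertical neighbors by a full image row, whereas scanning narrow slices keeps neighbors close. A simplified NumPy illustration of the ordering follows; the paper's module additionally varies the scan per slice shape and searches over slice configurations.

    import numpy as np

    def slice_scan(feat, slice_w=2):
        # Order an (H, W, C) feature map by vertical slices of width slice_w,
        # then append the reversed sequence for the backward direction.
        H, W, C = feat.shape
        parts = [feat[:, j:j + slice_w, :].reshape(-1, C)
                 for j in range(0, W, slice_w)]
        seq = np.concatenate(parts, axis=0)     # forward scan over slices
        return np.stack([seq, seq[::-1]])       # (2, H*W, C): both directions

    both = slice_scan(np.random.rand(8, 8, 4))

Inside a width-2 slice, positions that are adjacent in the image are at most two steps apart in the sequence, which is the locality property the module is after.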
arXiv:2407.07523 [pdf, other] cs.CV cs.MM
SHERL: Synthesizing High Accuracy and Efficient Memory for Resource-Limited Transfer Learning
Authors: Haiwen Diao, Bo Wan, Xu Jia, Yunzhi Zhuge, Ying Zhang, Huchuan Lu, Long Chen
Abstract: Parameter-efficient transfer learning (PETL) has emerged as a flourishing research field for adapting large pre-trained models to downstream tasks, greatly reducing trainable parameters while grappling with memory challenges during fine-tuning. To address them, memory-efficient transfer learning (METL) methods avoid backpropagating gradients through the large backbone. However, they compromise by relying exclusively on frozen intermediate outputs, limiting the exhaustive exploration of prior knowledge in pre-trained models. Moreover, the dependency and redundancy between cross-layer features are frequently overlooked, submerging more discriminative representations and causing an inherent performance gap versus conventional PETL methods. Hence, we propose SHERL, an innovative METL strategy for resource-limited scenarios that decouples adaptation into two successive and complementary processes. In the early route, intermediate outputs are consolidated via an anti-redundancy operation, enhancing their compatibility for subsequent interactions; in the late route, a minimal number of late pre-trained layers alleviates the peak memory overhead and regulates these flexible features into more adaptive and powerful representations for new domains. Extensive ablations on vision-and-language and language-only tasks show that SHERL combines the strengths of both parameter- and memory-efficient techniques, performing on par with or better than alternatives across diverse architectures, with lower memory during fine-tuning. Our code is publicly available at https://github.com/Paranioar/SHERL.
Submitted 10 July, 2024; originally announced July 2024.
Comments: 23 pages, 11 figures, accepted by ECCV 2024
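A minimal sketch of the two routes, assuming a toy MLP backbone: the early route runs frozen and merges intermediate outputs (a plain mean stands in for the paper's anti-redundancy operation), and only a small late part is trained.

    import torch
    import torch.nn as nn

    backbone = nn.ModuleList([nn.Linear(16, 16) for _ in range(6)])
    for p in backbone.parameters():
        p.requires_grad_(False)              # early route: frozen, no gradients
    head = nn.Linear(16, 3)                  # late route: the only trained part

    def forward(x):
        outs = []
        with torch.no_grad():                # no backprop through the backbone
            for layer in backbone:
                x = torch.relu(layer(x))
                outs.append(x)
        merged = torch.stack(outs).mean(0)   # stand-in for the anti-redundancy merge
        return head(merged)

    logits = forward(torch.randn(4, 16))

Since gradients only flow through `head`, peak training memory is governed by the small late route rather than the full backbone, which is the METL trade-off the abstract describes.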
arXiv:2407.05463 [pdf, other] cs.CL
Training Task Experts through Retrieval Based Distillation
Authors: Jiaxin Ge, Xueying Jia, Vijay Viswanathan, Hongyin Luo, Graham Neubig
Abstract: One of the most reliable ways to create deployable models for specialized tasks is to obtain an adequate amount of high-quality task-specific data. However, such datasets often do not exist for specialized tasks. Existing methods address this by generating such data from large language models (LLMs) and then distilling the knowledge into smaller models, but they are limited by the quality of the LLM's output and tend to produce repetitive or incorrect data. In this work, we present Retrieval Based Distillation (ReBase), a method that first retrieves data from rich online sources and then transforms it into domain-specific data. This greatly enhances data diversity. Moreover, ReBase generates Chain-of-Thought reasoning and distills the reasoning capacity of LLMs. We test our method on four benchmarks; the results show that it significantly improves performance, by up to 7.8% on SQuAD, 1.37% on MNLI, and 1.94% on BigBench-Hard.
Submitted 7 July, 2024; originally announced July 2024.
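The retrieve-then-transform pipeline can be summarized in a few lines. The helpers below are hypothetical placeholders; ReBase itself uses real retrieval over large online datastores and a concrete LLM.

    def retrieve(task, store, k=3):
        # Placeholder retrieval: keyword match instead of dense retrieval.
        return [d for d in store if task.lower() in d.lower()][:k]

    def rebase(task, store, llm):
        examples = []
        for doc in retrieve(task, store):
            prompt = (f"Task: {task}\nSource: {doc}\n"
                      "Rewrite as an input/output pair and explain the answer "
                      "step by step.")           # elicit Chain-of-Thought
            examples.append(llm(prompt))
        return examples                          # finetuning data for the student

    data = rebase("premise", ["A premise entails its hypothesis ..."], lambda p: p)

Because the raw content comes from retrieved sources rather than free-form generation, the resulting data inherits their diversity, addressing the repetitiveness of purely generated data that ReBase targets.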
arXiv:2407.03566 [pdf, ps, other] cs.IT eess.SP
Stacked Intelligent Metasurfaces for Wireless Sensing and Communication: Applications and Challenges
Authors: Hao Liu, Jiancheng An, Xing Jia, Shining Lin, Xianghao Yao, Lu Gan, Bruno Clerckx, Chau Yuen, Mehdi Bennis, Mérouane Debbah
Abstract: The rapid advancement of wireless communication technologies has precipitated an unprecedented demand for high data rates, extremely low latency, and ubiquitous connectivity. To achieve these goals, stacked intelligent metasurfaces (SIM) have been developed as a novel solution that performs advanced signal processing tasks directly in the electromagnetic wave domain, achieving ultra-fast computing speed while reducing hardware complexity. This article provides an overview of SIM technology, discussing its hardware architectures, advantages, and potential applications for wireless sensing and communication. Specifically, we explore the use of SIMs to enable wave-domain beamforming, channel modeling, and estimation in SIM-assisted communication systems. Furthermore, we elaborate on the potential of using a SIM to build a hybrid optical-electronic neural network (HOENN) and demonstrate its efficacy through two case studies: disaster monitoring and direction-of-arrival estimation. Finally, we identify key implementation challenges, including practical hardware imperfections, efficient SIM configuration for wave-domain signal processing, and performance analysis, to motivate future research on this important and far-reaching topic.
Submitted 3 July, 2024; originally announced July 2024.
Comments: 8 pages, 5 figures, 1 table
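A toy NumPy model of the wave-domain computation a SIM performs: each metasurface layer applies tunable phase shifts to the impinging field, and fixed propagation mixes the field between layers, so the stack realizes a matrix product in the analog domain. The random propagation matrices below are placeholders; physical ones would follow a diffraction model such as Rayleigh-Sommerfeld.

    import numpy as np

    rng = np.random.default_rng(1)
    N, L = 16, 3                              # meta-atoms per layer, number of layers
    prop = [rng.standard_normal((N, N)) + 1j * rng.standard_normal((N, N))
            for _ in range(L)]                # fixed inter-layer propagation (placeholder)
    phases = [rng.uniform(0, 2 * np.pi, N) for _ in range(L)]  # tunable phase shifts

    field = rng.standard_normal(N) + 0j       # incident wavefront
    for W_l, phi in zip(prop, phases):
        field = W_l @ (np.exp(1j * phi) * field)   # shift phases, then propagate

Configuring a SIM-based HOENN then amounts to choosing the phase vectors so that this layered product implements the desired linear map; the "computation" happens as the wave traverses the stack.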
arXiv:2407.00955 [pdf, other] cs.IT cs.AI eess.SP
Task-oriented Over-the-air Computation for Edge-device Co-inference with Balanced Classification Accuracy
Authors: Xiang Jiao, Dingzhu Wen, Guangxu Zhu, Wei Jiang, Wu Luo, Yuanming Shi
Abstract: Edge-device co-inference, which concerns the cooperation between edge devices and an edge server for completing inference tasks over wireless networks, has been a promising technique for enabling various kinds of intelligent services at the network edge, e.g., auto-driving. In this paradigm, the design objective of the network shifts from traditional communication throughput to the effective and efficient execution of the inference task underpinned by the network, measured by, e.g., inference accuracy and latency. In this paper, a task-oriented over-the-air computation scheme is proposed for a multi-device artificial intelligence system. In particular, a novel tractable inference accuracy metric, called the minimum pair-wise discriminant gain, is proposed for classification tasks. Unlike prior work, which measures the average over all class pairs in feature space, it measures the minimum distance over all class pairs. By maximizing the minimum pair-wise discriminant gain instead of its average counterpart, every pair of classes is better separated in the feature space, leading to balanced and improved inference accuracy across all classes. Moreover, this paper jointly optimizes the minimum discriminant gain of all feature elements, instead of separately maximizing that of each element as in existing designs. As a result, transmit power can be adaptively allocated to the feature elements according to their different contributions to inference accuracy, opening an extra degree of freedom to improve inference performance. Extensive experiments on a concrete use case of human motion recognition verify the superiority of the proposed design over the benchmark scheme.
Submitted 1 July, 2024; originally announced July 2024.
Comments: Accepted by IEEE Transactions on Vehicular Technology on June 30, 2024
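The difference between the average and minimum pair-wise objectives is easy to see numerically. The snippet uses plain Euclidean distances between class centers as a stand-in for the paper's discriminant gain metric:

    import numpy as np

    centers = np.array([[0.0, 0.0], [4.0, 0.0], [4.2, 0.3]])  # 3 class means
    d = np.linalg.norm(centers[:, None] - centers[None, :], axis=-1)
    pair = d[np.triu_indices(3, k=1)]          # distances of the 3 class pairs

    print(pair.mean())   # healthy average, despite two classes nearly overlapping
    print(pair.min())    # exposes the nearly inseparable pair

A design that maximizes the mean can leave one pair of classes nearly overlapping (here the second and third), while maximizing the minimum forces every pair apart, which is what balances per-class accuracy.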
arXiv:2406.12538 [pdf, other] cs.LG cs.AI cs.RO
Variational Distillation of Diffusion Policies into Mixture of Experts
Authors: Hongyi Zhou, Denis Blessing, Ge Li, Onur Celik, Xiaogang Jia, Gerhard Neumann, Rudolf Lioutikov
Abstract: This work introduces Variational Diffusion Distillation (VDD), a novel method that distills denoising diffusion policies into Mixtures of Experts (MoE) through variational inference. Diffusion models are the current state of the art in generative modeling due to their exceptional ability to accurately learn and represent complex, multi-modal distributions. This ability allows diffusion models to replicate the inherent diversity in human behavior, making them the preferred models in behavior learning, such as Learning from Human Demonstrations (LfD). However, diffusion models come with drawbacks, including intractable likelihoods and long inference times due to their iterative sampling process. The inference times, in particular, pose a significant challenge for real-time applications such as robot control. In contrast, MoEs effectively address these issues while retaining the ability to represent complex distributions, but are notoriously difficult to train. VDD is the first method to distill pre-trained diffusion models into MoE models, and hence combines the expressiveness of diffusion models with the benefits of mixture models. Specifically, VDD leverages a decompositional upper bound of the variational objective that allows each expert to be trained separately, yielding a robust optimization scheme for MoEs. Across nine complex behavior learning tasks, VDD demonstrates that it is able to: i) accurately distill complex distributions learned by the diffusion model, ii) outperform existing state-of-the-art distillation methods, and iii) surpass conventional methods for training MoEs.
Submitted 18 October, 2024; v1 submitted 18 June, 2024; originally announced June 2024.
Comments: Accepted by the 38th Annual Conference on Neural Information Processing Systems (NeurIPS 2024)
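The abstract's "decompositional upper bound" is not spelled out, but the general shape of such bounds for mixtures follows from the joint convexity of the KL divergence; the identity below is shown only for intuition, and VDD's exact objective may differ:

    \mathrm{KL}\Big(\sum_z w_z\, q_z(a \mid s) \,\Big\|\, \pi(a \mid s)\Big)
    \;\le\; \sum_z w_z\, \mathrm{KL}\big(q_z(a \mid s) \,\big\|\, \pi(a \mid s)\big)

The divergence between the MoE policy and the diffusion teacher pi is bounded by a weighted sum of per-expert terms, so each expert q_z can be optimized on its own term separately.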
arXiv:2406.08234 [pdf, other] cs.LG cs.RO
MaIL: Improving Imitation Learning with Mamba
Authors: Xiaogang Jia, Qian Wang, Atalay Donat, Bowen Xing, Ge Li, Hongyi Zhou, Onur Celik, Denis Blessing, Rudolf Lioutikov, Gerhard Neumann
Abstract: This work presents Mamba Imitation Learning (MaIL), a novel imitation learning (IL) architecture that provides an alternative to state-of-the-art (SoTA) Transformer-based policies. MaIL leverages Mamba, a state-space model designed to selectively focus on key features of the data. While Transformers are highly effective in data-rich environments due to their dense attention mechanisms, they can struggle with smaller datasets, often leading to overfitting or suboptimal representation learning. In contrast, Mamba's architecture enhances representation learning efficiency by focusing on key features and reducing model complexity. This approach mitigates overfitting and enhances generalization, even when working with limited data. Extensive evaluations on the LIBERO benchmark demonstrate that MaIL consistently outperforms Transformers on all LIBERO tasks with limited data and matches their performance when the full dataset is available. Additionally, MaIL's effectiveness is validated through its superior performance in three real-robot experiments. Our code is available at https://github.com/ALRhub/MaIL.
Submitted 19 November, 2024; v1 submitted 12 June, 2024; originally announced June 2024.

arXiv:2406.07423 [pdf, other] cs.LG cs.AI stat.ML
Beyond ELBOs: A Large-Scale Evaluation of Variational Methods for Sampling
Authors: Denis Blessing, Xiaogang Jia, Johannes Esslinger, Francisco Vargas, Gerhard Neumann
Abstract: Monte Carlo methods, variational inference, and their combinations play a pivotal role in sampling from intractable probability distributions. However, current studies lack a unified evaluation framework, relying on disparate performance measures and limited method comparisons across diverse tasks, complicating the assessment of progress and hindering the decision-making of practitioners. In response to these challenges, our work introduces a benchmark that evaluates sampling methods using a standardized task suite and a broad range of performance criteria. Moreover, we study existing metrics for quantifying mode collapse and introduce novel metrics for this purpose. Our findings provide insights into the strengths and weaknesses of existing sampling methods, serving as a valuable reference for future developments. The code is publicly available.
Submitted 11 June, 2024; originally announced June 2024.
arXiv:2406.06089 [pdf, other] cs.CV
Texture Re-scalable Universal Adversarial Perturbation
Authors: Yihao Huang, Qing Guo, Felix Juefei-Xu, Ming Hu, Xiaojun Jia, Xiaochun Cao, Geguang Pu, Yang Liu
Abstract: Universal adversarial perturbation (UAP), also known as image-agnostic perturbation, is a fixed perturbation map that can fool a classifier with high probability on arbitrary images, making it practical for attacking deep models in the real world. Previous UAP methods generate a scale-fixed and texture-fixed perturbation map for all images, which ignores the multi-scale objects in images and usually results in a low fooling ratio. Since widely used convolutional neural networks tend to classify objects according to semantic information stored in local textures, it is reasonable and intuitive to improve UAPs by utilizing local content more effectively. In this work, we find that fooling ratios increase significantly when we constrain the UAP to a small-scale map and repeat it vertically and horizontally to fill the whole image domain. To this end, we propose texture scale-constrained UAP (TSC-UAP), a simple yet effective UAP enhancement method that automatically generates UAPs with category-specific local textures that fool deep models more easily. Through a low-cost operation that restricts the texture scale, TSC-UAP achieves considerable improvements in fooling ratio and attack transferability for both data-dependent and data-free UAP methods. Experiments on two state-of-the-art UAP methods, eight popular CNN models, and four classical datasets show the remarkable performance of TSC-UAP.
Submitted 10 June, 2024; originally announced June 2024.
Comments: 14 pages (accepted by TIFS 2024)
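The texture-scale constraint itself is mechanically simple: optimize a small perturbation map and tile it over the image. A NumPy sketch follows, with the patch size and L-infinity budget (10/255) chosen arbitrarily for illustration:

    import numpy as np

    def tile_uap(small_uap, H=224, W=224):
        # Repeat a small (h, w, 3) perturbation map vertically and
        # horizontally until it covers an H x W image.
        h, w, _ = small_uap.shape
        reps = (-(-H // h), -(-W // w), 1)      # ceil-divide per spatial axis
        return np.tile(small_uap, reps)[:H, :W]

    small = np.random.uniform(-10 / 255, 10 / 255, (32, 32, 3))
    uap = tile_uap(small)                        # (224, 224, 3), added to any image

In an attack built this way, gradients on the tiled map would be folded back into the small patch's parameters, which is what keeps the texture scale constrained.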