Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 6,584 results for author: <span class="mathjax">Zhang, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhang, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19772">arXiv:2411.19772</a> <span> [<a href="https://arxiv.org/pdf/2411.19772">pdf</a>, <a href="https://arxiv.org/format/2411.19772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> LongVALE: Vision-Audio-Language-Event Benchmark Towards Time-Aware Omni-Modal Perception of Long Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geng%2C+T">Tiantian Geng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jinrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qingni Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Teng Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+J">Jinming Duan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+F">Feng Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19772v1-abstract-short" style="display: inline;"> Despite impressive advancements in video understanding, most efforts remain limited to coarse-grained or visual-only video tasks. However, real-world videos encompass omni-modal information (vision, audio, and speech) with a series of events forming a cohesive storyline. The lack of multi-modal video data with fine-grained event annotations and the high cost of manual labeling are major obstacles… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19772v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19772v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19772v1-abstract-full" style="display: none;"> Despite impressive advancements in video understanding, most efforts remain limited to coarse-grained or visual-only video tasks. However, real-world videos encompass omni-modal information (vision, audio, and speech) with a series of events forming a cohesive storyline. The lack of multi-modal video data with fine-grained event annotations and the high cost of manual labeling are major obstacles to comprehensive omni-modality video perception. To address this gap, we propose an automatic pipeline consisting of high-quality multi-modal video filtering, semantically coherent omni-modal event boundary detection, and cross-modal correlation-aware event captioning. In this way, we present LongVALE, the first-ever Vision-Audio-Language Event understanding benchmark comprising 105K omni-modal events with precise temporal boundaries and detailed relation-aware captions within 8.4K high-quality long videos. Further, we build a baseline that leverages LongVALE to enable video large language models (LLMs) for omni-modality fine-grained temporal video understanding for the first time. Extensive experiments demonstrate the effectiveness and great potential of LongVALE in advancing comprehensive multi-modal video understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19772v1-abstract-full').style.display = 'none'; document.getElementById('2411.19772v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19668">arXiv:2411.19668</a> <span> [<a href="https://arxiv.org/pdf/2411.19668">pdf</a>, <a href="https://arxiv.org/format/2411.19668">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ChineseWebText 2.0: Large-Scale High-quality Chinese Web Text with Multi-dimensional and fine-grained information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wanyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziyong Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wen Yang</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+C">Chunlin Leng</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yinan Bai</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Q">Qianlong Du</a>, <a href="/search/cs?searchtype=author&query=Zong%2C+C">Chengqing Zong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19668v1-abstract-short" style="display: inline;"> During the development of large language models (LLMs), pre-training data play a critical role in shaping LLMs' capabilities. In recent years several large-scale and high-quality pre-training datasets have been released to accelerate the research of LLMs, including ChineseWebText1.0, C4, Pile, WanJuan, MAPCC and others. However, as LLMs continue to evolve, focus has increasingly shifted to domain-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19668v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19668v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19668v1-abstract-full" style="display: none;"> During the development of large language models (LLMs), pre-training data play a critical role in shaping LLMs' capabilities. In recent years several large-scale and high-quality pre-training datasets have been released to accelerate the research of LLMs, including ChineseWebText1.0, C4, Pile, WanJuan, MAPCC and others. However, as LLMs continue to evolve, focus has increasingly shifted to domain-specific capabilities and safety concerns, making those previous coarse-grained texts insufficient for meeting training requirements. Furthermore, fine-grained information, such as quality, domain and toxicity, is becoming increasingly important in building powerful and reliable LLMs for various scenarios. To address these challenges, in this paper we propose a new tool-chain called MDFG-tool for constructing large-scale and high-quality Chinese datasets with multi-dimensional and fine-grained information. First, we employ manually crafted rules to discard explicit noisy texts from raw contents. Second, the quality evaluation model, domain classifier, and toxicity evaluation model are well-designed to assess the remaining cleaned data respectively. Finally, we integrate these three types of fine-grained information for each text. With this approach, we release the largest, high-quality and fine-grained Chinese text ChineseWebText2.0, which consists of 3.8TB and each text is associated with a quality score, domain labels, a toxicity label and a toxicity score, facilitating the LLM researchers to select data based on various types of fine-grained information. The data, codes and the tool-chain are available on this website https://github.com/CASIA-LM/ChineseWebText-2.0 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19668v1-abstract-full').style.display = 'none'; document.getElementById('2411.19668v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ChineseWebTex2.0 dataset is available at https://github.com/CASIA-LM/ChineseWebText-2.0</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19289">arXiv:2411.19289</a> <span> [<a href="https://arxiv.org/pdf/2411.19289">pdf</a>, <a href="https://arxiv.org/format/2411.19289">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GMS-VINS:Multi-category Dynamic Objects Semantic Segmentation for Enhanced Visual-Inertial Odometry Using a Promptable Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Rui Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingbin Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Junbin Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yingze Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jiele Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19289v1-abstract-short" style="display: inline;"> Visual-inertial odometry (VIO) is widely used in various fields, such as robots, drones, and autonomous vehicles, due to its low cost and complementary sensors. Most VIO methods presuppose that observed objects are static and time-invariant. However, real-world scenes often feature dynamic objects, compromising the accuracy of pose estimation. These moving entities include cars, trucks, buses, mot… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19289v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19289v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19289v1-abstract-full" style="display: none;"> Visual-inertial odometry (VIO) is widely used in various fields, such as robots, drones, and autonomous vehicles, due to its low cost and complementary sensors. Most VIO methods presuppose that observed objects are static and time-invariant. However, real-world scenes often feature dynamic objects, compromising the accuracy of pose estimation. These moving entities include cars, trucks, buses, motorcycles, and pedestrians. The diversity and partial occlusion of these objects present a tough challenge for existing dynamic object removal techniques. To tackle this challenge, we introduce GMS-VINS, which integrates an enhanced SORT algorithm along with a robust multi-category segmentation framework into VIO, thereby improving pose estimation accuracy in environments with diverse dynamic objects and frequent occlusions. Leveraging the promptable foundation model, our solution efficiently tracks and segments a wide range of object categories. The enhanced SORT algorithm significantly improves the reliability of tracking multiple dynamic objects, especially in urban settings with partial occlusions or swift movements. We evaluated our proposed method using multiple public datasets representing various scenes, as well as in a real-world scenario involving diverse dynamic objects. The experimental results demonstrate that our proposed method performs impressively in multiple scenarios, outperforming other state-of-the-art methods. This highlights its remarkable generalization and adaptability in diverse dynamic environments, showcasing its potential to handle various dynamic objects in practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19289v1-abstract-full').style.display = 'none'; document.getElementById('2411.19289v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19235">arXiv:2411.19235</a> <span> [<a href="https://arxiv.org/pdf/2411.19235">pdf</a>, <a href="https://arxiv.org/format/2411.19235">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> InstanceGaussian: Appearance-Semantic Joint Gaussian Representation for 3D Instance-Level Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haijie Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yanmin Wu</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+J">Jiarui Meng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qiankun Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiyao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ronggang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19235v1-abstract-short" style="display: inline;"> 3D scene understanding has become an essential area of research with applications in autonomous driving, robotics, and augmented reality. Recently, 3D Gaussian Splatting (3DGS) has emerged as a powerful approach, combining explicit modeling with neural adaptability to provide efficient and detailed scene representations. However, three major challenges remain in leveraging 3DGS for scene understan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19235v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19235v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19235v1-abstract-full" style="display: none;"> 3D scene understanding has become an essential area of research with applications in autonomous driving, robotics, and augmented reality. Recently, 3D Gaussian Splatting (3DGS) has emerged as a powerful approach, combining explicit modeling with neural adaptability to provide efficient and detailed scene representations. However, three major challenges remain in leveraging 3DGS for scene understanding: 1) an imbalance between appearance and semantics, where dense Gaussian usage for fine-grained texture modeling does not align with the minimal requirements for semantic attributes; 2) inconsistencies between appearance and semantics, as purely appearance-based Gaussians often misrepresent object boundaries; and 3) reliance on top-down instance segmentation methods, which struggle with uneven category distributions, leading to over- or under-segmentation. In this work, we propose InstanceGaussian, a method that jointly learns appearance and semantic features while adaptively aggregating instances. Our contributions include: i) a novel Semantic-Scaffold-GS representation balancing appearance and semantics to improve feature representations and boundary delineation; ii) a progressive appearance-semantic joint training strategy to enhance stability and segmentation accuracy; and iii) a bottom-up, category-agnostic instance aggregation approach that addresses segmentation challenges through farthest point sampling and connected component analysis. Our approach achieves state-of-the-art performance in category-agnostic, open-vocabulary 3D point-level segmentation, highlighting the effectiveness of the proposed representation and training strategies. Project page: https://lhj-git.github.io/InstanceGaussian/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19235v1-abstract-full').style.display = 'none'; document.getElementById('2411.19235v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">technical report, 13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18956">arXiv:2411.18956</a> <span> [<a href="https://arxiv.org/pdf/2411.18956">pdf</a>, <a href="https://arxiv.org/format/2411.18956">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Random Sampling for Diffusion-based Adversarial Purification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiancheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+P">Peiran Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongyong Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yin-Ping Zhao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18956v1-abstract-short" style="display: inline;"> Denoising Diffusion Probabilistic Models (DDPMs) have gained great attention in adversarial purification. Current diffusion-based works focus on designing effective condition-guided mechanisms while ignoring a fundamental problem, i.e., the original DDPM sampling is intended for stable generation, which may not be the optimal solution for adversarial purification. Inspired by the stability of the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18956v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18956v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18956v1-abstract-full" style="display: none;"> Denoising Diffusion Probabilistic Models (DDPMs) have gained great attention in adversarial purification. Current diffusion-based works focus on designing effective condition-guided mechanisms while ignoring a fundamental problem, i.e., the original DDPM sampling is intended for stable generation, which may not be the optimal solution for adversarial purification. Inspired by the stability of the Denoising Diffusion Implicit Model (DDIM), we propose an opposite sampling scheme called random sampling. In brief, random sampling will sample from a random noisy space during each diffusion process, while DDPM and DDIM sampling will continuously sample from the adjacent or original noisy space. Thus, random sampling obtains more randomness and achieves stronger robustness against adversarial attacks. Correspondingly, we also introduce a novel mediator conditional guidance to guarantee the consistency of the prediction under the purified image and clean image input. To expand awareness of guided diffusion purification, we conduct a detailed evaluation with different sampling methods and our random sampling achieves an impressive improvement in multiple settings. Leveraging mediator-guided random sampling, we also establish a baseline method named DiffAP, which significantly outperforms state-of-the-art (SOTA) approaches in performance and defensive stability. Remarkably, under strong attack, our DiffAP even achieves a more than 20% robustness advantage with 10$\times$ sampling acceleration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18956v1-abstract-full').style.display = 'none'; document.getElementById('2411.18956v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18823">arXiv:2411.18823</a> <span> [<a href="https://arxiv.org/pdf/2411.18823">pdf</a>, <a href="https://arxiv.org/format/2411.18823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Task Label Discovery via Hierarchical Task Tokens for Partially Annotated Dense Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingdong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hanrong Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18823v1-abstract-short" style="display: inline;"> In recent years, simultaneous learning of multiple dense prediction tasks with partially annotated label data has emerged as an important research area. Previous works primarily focus on constructing cross-task consistency or conducting adversarial training to regularize cross-task predictions, which achieve promising performance improvements, while still suffering from the lack of direct pixel-wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18823v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18823v1-abstract-full" style="display: none;"> In recent years, simultaneous learning of multiple dense prediction tasks with partially annotated label data has emerged as an important research area. Previous works primarily focus on constructing cross-task consistency or conducting adversarial training to regularize cross-task predictions, which achieve promising performance improvements, while still suffering from the lack of direct pixel-wise supervision for multi-task dense predictions. To tackle this challenge, we propose a novel approach to optimize a set of learnable hierarchical task tokens, including global and fine-grained ones, to discover consistent pixel-wise supervision signals in both feature and prediction levels. Specifically, the global task tokens are designed for effective cross-task feature interactions in a global context. Then, a group of fine-grained task-specific spatial tokens for each task is learned from the corresponding global task tokens. It is embedded to have dense interactions with each task-specific feature map. The learned global and local fine-grained task tokens are further used to discover pseudo task-specific dense labels at different levels of granularity, and they can be utilized to directly supervise the learning of the multi-task dense prediction framework. Extensive experimental results on challenging NYUD-v2, Cityscapes, and PASCAL Context datasets demonstrate significant improvements over existing state-of-the-art methods for partially annotated multi-task dense prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18823v1-abstract-full').style.display = 'none'; document.getElementById('2411.18823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18752">arXiv:2411.18752</a> <span> [<a href="https://arxiv.org/pdf/2411.18752">pdf</a>, <a href="https://arxiv.org/format/2411.18752">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Locally Differentially Private Online Federated Learning With Correlated Noise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaojiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Linglingzhi Zhu</a>, <a href="/search/cs?searchtype=author&query=Fay%2C+D">Dominik Fay</a>, <a href="/search/cs?searchtype=author&query=Johansson%2C+M">Mikael Johansson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18752v1-abstract-short" style="display: inline;"> We introduce a locally differentially private (LDP) algorithm for online federated learning that employs temporally correlated noise to improve utility while preserving privacy. To address challenges posed by the correlated noise and local updates with streaming non-IID data, we develop a perturbed iterate analysis that controls the impact of the noise on the utility. Moreover, we demonstrate how… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18752v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18752v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18752v1-abstract-full" style="display: none;"> We introduce a locally differentially private (LDP) algorithm for online federated learning that employs temporally correlated noise to improve utility while preserving privacy. To address challenges posed by the correlated noise and local updates with streaming non-IID data, we develop a perturbed iterate analysis that controls the impact of the noise on the utility. Moreover, we demonstrate how the drift errors from local updates can be effectively managed for several classes of nonconvex loss functions. Subject to an $(蔚,未)$-LDP budget, we establish a dynamic regret bound that quantifies the impact of key parameters and the intensity of changes in the dynamic environment on the learning performance. Numerical experiments confirm the efficacy of the proposed algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18752v1-abstract-full').style.display = 'none'; document.getElementById('2411.18752v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2403.16542</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18624">arXiv:2411.18624</a> <span> [<a href="https://arxiv.org/pdf/2411.18624">pdf</a>, <a href="https://arxiv.org/format/2411.18624">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeneMAN: Generalizable Single-Image 3D Human Reconstruction from Multi-Source Human Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wentao Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hang Ye</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+F">Fangzhou Hong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xue Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianfu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+L">Liang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18624v1-abstract-short" style="display: inline;"> Given a single in-the-wild human photo, it remains a challenging task to reconstruct a high-fidelity 3D human model. Existing methods face difficulties including a) the varying body proportions captured by in-the-wild human images; b) diverse personal belongings within the shot; and c) ambiguities in human postures and inconsistency in human textures. In addition, the scarcity of high-quality huma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18624v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18624v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18624v1-abstract-full" style="display: none;"> Given a single in-the-wild human photo, it remains a challenging task to reconstruct a high-fidelity 3D human model. Existing methods face difficulties including a) the varying body proportions captured by in-the-wild human images; b) diverse personal belongings within the shot; and c) ambiguities in human postures and inconsistency in human textures. In addition, the scarcity of high-quality human data intensifies the challenge. To address these problems, we propose a Generalizable image-to-3D huMAN reconstruction framework, dubbed GeneMAN, building upon a comprehensive multi-source collection of high-quality human data, including 3D scans, multi-view videos, single photos, and our generated synthetic human data. GeneMAN encompasses three key modules. 1) Without relying on parametric human models (e.g., SMPL), GeneMAN first trains a human-specific text-to-image diffusion model and a view-conditioned diffusion model, serving as GeneMAN 2D human prior and 3D human prior for reconstruction, respectively. 2) With the help of the pretrained human prior models, the Geometry Initialization-&-Sculpting pipeline is leveraged to recover high-quality 3D human geometry given a single image. 3) To achieve high-fidelity 3D human textures, GeneMAN employs the Multi-Space Texture Refinement pipeline, consecutively refining textures in the latent and the pixel spaces. Extensive experimental results demonstrate that GeneMAN could generate high-quality 3D human models from a single image input, outperforming prior state-of-the-art methods. Notably, GeneMAN could reveal much better generalizability in dealing with in-the-wild images, often yielding high-quality 3D human models in natural poses with common items, regardless of the body proportions in the input images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18624v1-abstract-full').style.display = 'none'; document.getElementById('2411.18624v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://roooooz.github.io/GeneMAN/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18328">arXiv:2411.18328</a> <span> [<a href="https://arxiv.org/pdf/2411.18328">pdf</a>, <a href="https://arxiv.org/format/2411.18328">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EventCrab: Harnessing Frame and Point Synergy for Event-based Action Recognition and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+M">Meiqi Cao</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiangbo Shu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiachao Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rui Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zechao Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jinhui Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18328v1-abstract-short" style="display: inline;"> Event-based Action Recognition (EAR) possesses the advantages of high-temporal resolution capturing and privacy preservation compared with traditional action recognition. Current leading EAR solutions typically follow two regimes: project unconstructed event streams into dense constructed event frames and adopt powerful frame-specific networks, or employ lightweight point-specific networks to hand… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18328v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18328v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18328v1-abstract-full" style="display: none;"> Event-based Action Recognition (EAR) possesses the advantages of high-temporal resolution capturing and privacy preservation compared with traditional action recognition. Current leading EAR solutions typically follow two regimes: project unconstructed event streams into dense constructed event frames and adopt powerful frame-specific networks, or employ lightweight point-specific networks to handle sparse unconstructed event points directly. However, such two regimes are blind to a fundamental issue: failing to accommodate the unique dense temporal and sparse spatial properties of asynchronous event data. In this article, we present a synergy-aware framework, i.e., EventCrab, that adeptly integrates the "lighter" frame-specific networks for dense event frames with the "heavier" point-specific networks for sparse event points, balancing accuracy and efficiency. Furthermore, we establish a joint frame-text-point representation space to bridge distinct event frames and points. In specific, to better exploit the unique spatiotemporal relationships inherent in asynchronous event points, we devise two strategies for the "heavier" point-specific embedding: i) a Spiking-like Context Learner (SCL) that extracts contextualized event points from raw event streams. ii) an Event Point Encoder (EPE) that further explores event-point long spatiotemporal features in a Hilbert-scan way. Experiments on four datasets demonstrate the significant performance of our proposed EventCrab, particularly gaining improvements of 5.17% on SeAct and 7.01% on HARDVS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18328v1-abstract-full').style.display = 'none'; document.getElementById('2411.18328v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18276">arXiv:2411.18276</a> <span> [<a href="https://arxiv.org/pdf/2411.18276">pdf</a>, <a href="https://arxiv.org/format/2411.18276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GAPartManip: A Large-scale Part-centric Dataset for Material-Agnostic Articulated Object Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+W">Wenbo Cui</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+S">Songlin Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiazhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+H">Haoran Geng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yaran Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">He Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18276v1-abstract-short" style="display: inline;"> Effectively manipulating articulated objects in household scenarios is a crucial step toward achieving general embodied artificial intelligence. Mainstream research in 3D vision has primarily focused on manipulation through depth perception and pose detection. However, in real-world environments, these methods often face challenges due to imperfect depth perception, such as with transparent lids a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18276v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18276v1-abstract-full" style="display: none;"> Effectively manipulating articulated objects in household scenarios is a crucial step toward achieving general embodied artificial intelligence. Mainstream research in 3D vision has primarily focused on manipulation through depth perception and pose detection. However, in real-world environments, these methods often face challenges due to imperfect depth perception, such as with transparent lids and reflective handles. Moreover, they generally lack the diversity in part-based interactions required for flexible and adaptable manipulation. To address these challenges, we introduced a large-scale part-centric dataset for articulated object manipulation that features both photo-realistic material randomizations and detailed annotations of part-oriented, scene-level actionable interaction poses. We evaluated the effectiveness of our dataset by integrating it with several state-of-the-art methods for depth estimation and interaction pose prediction. Additionally, we proposed a novel modular framework that delivers superior and robust performance for generalizable articulated object manipulation. Our extensive experiments demonstrate that our dataset significantly improves the performance of depth perception and actionable interaction pose prediction in both simulation and real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18276v1-abstract-full').style.display = 'none'; document.getElementById('2411.18276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18179">arXiv:2411.18179</a> <span> [<a href="https://arxiv.org/pdf/2411.18179">pdf</a>, <a href="https://arxiv.org/format/2411.18179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prediction with Action: Visual Policy Learning via Joint Denoising Process </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yanjiang Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yucheng Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianke Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yen-Jen Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chaochao Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianyu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18179v1-abstract-short" style="display: inline;"> Diffusion models have demonstrated remarkable capabilities in image generation tasks, including image editing and video creation, representing a good understanding of the physical world. On the other line, diffusion models have also shown promise in robotic control tasks by denoising actions, known as diffusion policy. Although the diffusion generative model and diffusion policy exhibit distinct c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18179v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18179v1-abstract-full" style="display: none;"> Diffusion models have demonstrated remarkable capabilities in image generation tasks, including image editing and video creation, representing a good understanding of the physical world. On the other line, diffusion models have also shown promise in robotic control tasks by denoising actions, known as diffusion policy. Although the diffusion generative model and diffusion policy exhibit distinct capabilities--image prediction and robotic action, respectively--they technically follow a similar denoising process. In robotic tasks, the ability to predict future images and generate actions is highly correlated since they share the same underlying dynamics of the physical world. Building on this insight, we introduce PAD, a novel visual policy learning framework that unifies image Prediction and robot Action within a joint Denoising process. Specifically, PAD utilizes Diffusion Transformers (DiT) to seamlessly integrate images and robot states, enabling the simultaneous prediction of future images and robot actions. Additionally, PAD supports co-training on both robotic demonstrations and large-scale video datasets and can be easily extended to other robotic modalities, such as depth images. PAD outperforms previous methods, achieving a significant 26.3% relative improvement on the full Metaworld benchmark, by utilizing a single text-conditioned visual policy within a data-efficient imitation learning setting. Furthermore, PAD demonstrates superior generalization to unseen tasks in real-world robot manipulation settings with 28.0% success rate increase compared to the strongest baseline. Project page at https://sites.google.com/view/pad-paper <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18179v1-abstract-full').style.display = 'none'; document.getElementById('2411.18179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18138">arXiv:2411.18138</a> <span> [<a href="https://arxiv.org/pdf/2411.18138">pdf</a>, <a href="https://arxiv.org/format/2411.18138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SALMONN-omni: A Codec-free LLM for Full-duplex Speech Understanding and Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenyi Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyin Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xianzhao Chen</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xiaohai Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Lu Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18138v1-abstract-short" style="display: inline;"> Full-duplex multimodal large language models (LLMs) provide a unified framework for addressing diverse speech understanding and generation tasks, enabling more natural and seamless human-machine conversations. Unlike traditional modularised conversational AI systems, which separate speech recognition, understanding, and text-to-speech generation into distinct components, multimodal LLMs operate as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18138v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18138v1-abstract-full" style="display: none;"> Full-duplex multimodal large language models (LLMs) provide a unified framework for addressing diverse speech understanding and generation tasks, enabling more natural and seamless human-machine conversations. Unlike traditional modularised conversational AI systems, which separate speech recognition, understanding, and text-to-speech generation into distinct components, multimodal LLMs operate as single end-to-end models. This streamlined design eliminates error propagation across components and fully leverages the rich non-verbal information embedded in input speech signals. We introduce SALMONN-omni, a codec-free, full-duplex speech understanding and generation model capable of simultaneously listening to its own generated speech and background sounds while speaking. To support this capability, we propose a novel duplex spoken dialogue framework incorporating a ``thinking'' mechanism that facilitates asynchronous text and speech generation relying on embeddings instead of codecs (quantized speech and audio tokens). Experimental results demonstrate SALMONN-omni's versatility across a broad range of streaming speech tasks, including speech recognition, speech enhancement, and spoken question answering. Additionally, SALMONN-omni excels at managing turn-taking, barge-in, and echo cancellation scenarios, establishing its potential as a robust prototype for full-duplex conversational AI systems. To the best of our knowledge, SALMONN-omni is the first codec-free model of its kind. A full technical report along with model checkpoints will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18138v1-abstract-full').style.display = 'none'; document.getElementById('2411.18138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18011">arXiv:2411.18011</a> <span> [<a href="https://arxiv.org/pdf/2411.18011">pdf</a>, <a href="https://arxiv.org/format/2411.18011">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Manual-PA: Learning 3D Part Assembly from Instruction Diagrams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiahao Zhang</a>, <a href="/search/cs?searchtype=author&query=Cherian%2C+A">Anoop Cherian</a>, <a href="/search/cs?searchtype=author&query=Rodriguez%2C+C">Cristian Rodriguez</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+W">Weijian Deng</a>, <a href="/search/cs?searchtype=author&query=Gould%2C+S">Stephen Gould</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18011v1-abstract-short" style="display: inline;"> Assembling furniture amounts to solving the discrete-continuous optimization task of selecting the furniture parts to assemble and estimating their connecting poses in a physically realistic manner. The problem is hampered by its combinatorially large yet sparse solution space thus making learning to assemble a challenging task for current machine learning models. In this paper, we attempt to solv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18011v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18011v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18011v1-abstract-full" style="display: none;"> Assembling furniture amounts to solving the discrete-continuous optimization task of selecting the furniture parts to assemble and estimating their connecting poses in a physically realistic manner. The problem is hampered by its combinatorially large yet sparse solution space thus making learning to assemble a challenging task for current machine learning models. In this paper, we attempt to solve this task by leveraging the assembly instructions provided in diagrammatic manuals that typically accompany the furniture parts. Our key insight is to use the cues in these diagrams to split the problem into discrete and continuous phases. Specifically, we present Manual-PA, a transformer-based instruction Manual-guided 3D Part Assembly framework that learns to semantically align 3D parts with their illustrations in the manuals using a contrastive learning backbone towards predicting the assembly order and infers the 6D pose of each part via relating it to the final furniture depicted in the manual. To validate the efficacy of our method, we conduct experiments on the benchmark PartNet dataset. Our results show that using the diagrams and the order of the parts lead to significant improvements in assembly performance against the state of the art. Further, Manual-PA demonstrates strong generalization to real-world IKEA furniture assembly on the IKEA-Manual dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18011v1-abstract-full').style.display = 'none'; document.getElementById('2411.18011v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17864">arXiv:2411.17864</a> <span> [<a href="https://arxiv.org/pdf/2411.17864">pdf</a>, <a href="https://arxiv.org/format/2411.17864">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generative Image Layer Decomposition with Visual Effects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinrui Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yijun Li</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Pakhomov%2C+D">Daniil Pakhomov</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Cihang Xie</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuyin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17864v1-abstract-short" style="display: inline;"> Recent advancements in large generative models, particularly diffusion-based methods, have significantly enhanced the capabilities of image editing. However, achieving precise control over image composition tasks remains a challenge. Layered representations, which allow for independent editing of image components, are essential for user-driven content creation, yet existing approaches often strugg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17864v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17864v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17864v1-abstract-full" style="display: none;"> Recent advancements in large generative models, particularly diffusion-based methods, have significantly enhanced the capabilities of image editing. However, achieving precise control over image composition tasks remains a challenge. Layered representations, which allow for independent editing of image components, are essential for user-driven content creation, yet existing approaches often struggle to decompose image into plausible layers with accurately retained transparent visual effects such as shadows and reflections. We propose $\textbf{LayerDecomp}$, a generative framework for image layer decomposition which outputs photorealistic clean backgrounds and high-quality transparent foregrounds with faithfully preserved visual effects. To enable effective training, we first introduce a dataset preparation pipeline that automatically scales up simulated multi-layer data with synthesized visual effects. To further enhance real-world applicability, we supplement this simulated dataset with camera-captured images containing natural visual effects. Additionally, we propose a consistency loss which enforces the model to learn accurate representations for the transparent foreground layer when ground-truth annotations are not available. Our method achieves superior quality in layer decomposition, outperforming existing approaches in object removal and spatial editing tasks across several benchmarks and multiple user studies, unlocking various creative possibilities for layer-wise image editing. The project page is https://rayjryang.github.io/LayerDecomp. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17864v1-abstract-full').style.display = 'none'; document.getElementById('2411.17864v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project page: https://rayjryang.github.io/LayerDecomp</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17832">arXiv:2411.17832</a> <span> [<a href="https://arxiv.org/pdf/2411.17832">pdf</a>, <a href="https://arxiv.org/format/2411.17832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SVGDreamer++: Advancing Editability and Diversity in Text-Guided SVG Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+X">Ximing Xing</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chuang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Haitao Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17832v1-abstract-short" style="display: inline;"> Recently, text-guided scalable vector graphics (SVG) synthesis has demonstrated significant potential in domains such as iconography and sketching. However, SVGs generated from existing Text-to-SVG methods often lack editability and exhibit deficiencies in visual quality and diversity. In this paper, we propose a novel text-guided vector graphics synthesis method to address these limitations. To i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17832v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17832v1-abstract-full" style="display: none;"> Recently, text-guided scalable vector graphics (SVG) synthesis has demonstrated significant potential in domains such as iconography and sketching. However, SVGs generated from existing Text-to-SVG methods often lack editability and exhibit deficiencies in visual quality and diversity. In this paper, we propose a novel text-guided vector graphics synthesis method to address these limitations. To improve the diversity of output SVGs, we present a Vectorized Particle-based Score Distillation (VPSD) approach. VPSD addresses over-saturation issues in existing methods and enhances sample diversity. A pre-trained reward model is incorporated to re-weight vector particles, improving aesthetic appeal and enabling faster convergence. Additionally, we design a novel adaptive vector primitives control strategy, which allows for the dynamic adjustment of the number of primitives, thereby enhancing the presentation of graphic details. Extensive experiments validate the effectiveness of the proposed method, demonstrating its superiority over baseline methods in terms of editability, visual quality, and diversity. We also show that our new method supports up to six distinct vector styles, capable of generating high-quality vector assets suitable for stylized vector design and poster design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17832v1-abstract-full').style.display = 'none'; document.getElementById('2411.17832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 17 figures. arXiv admin note: substantial text overlap with arXiv:2312.16476</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17820">arXiv:2411.17820</a> <span> [<a href="https://arxiv.org/pdf/2411.17820">pdf</a>, <a href="https://arxiv.org/format/2411.17820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CityWalker: Learning Embodied Urban Navigation from Web-Scale Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinhao Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jintong Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yicheng Jiang</a>, <a href="/search/cs?searchtype=author&query=Sujay%2C+N">Niranjan Sujay</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhicheng Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Juexiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Abanes%2C+J">John Abanes</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chen Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17820v2-abstract-short" style="display: inline;"> Navigating dynamic urban environments presents significant challenges for embodied agents, requiring advanced spatial reasoning and adherence to common-sense norms. Despite progress, existing visual navigation methods struggle in map-free or off-street settings, limiting the deployment of autonomous agents like last-mile delivery robots. To overcome these obstacles, we propose a scalable, data-dri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17820v2-abstract-full').style.display = 'inline'; document.getElementById('2411.17820v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17820v2-abstract-full" style="display: none;"> Navigating dynamic urban environments presents significant challenges for embodied agents, requiring advanced spatial reasoning and adherence to common-sense norms. Despite progress, existing visual navigation methods struggle in map-free or off-street settings, limiting the deployment of autonomous agents like last-mile delivery robots. To overcome these obstacles, we propose a scalable, data-driven approach for human-like urban navigation by training agents on thousands of hours of in-the-wild city walking and driving videos sourced from the web. We introduce a simple and scalable data processing pipeline that extracts action supervision from these videos, enabling large-scale imitation learning without costly annotations. Our model learns sophisticated navigation policies to handle diverse challenges and critical scenarios. Experimental results show that training on large-scale, diverse datasets significantly enhances navigation performance, surpassing current methods. This work shows the potential of using abundant online video data to develop robust navigation policies for embodied agents in dynamic urban settings. Project homepage is at https://ai4ce.github.io/CityWalker/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17820v2-abstract-full').style.display = 'none'; document.getElementById('2411.17820v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16746">arXiv:2411.16746</a> <span> [<a href="https://arxiv.org/pdf/2411.16746">pdf</a>, <a href="https://arxiv.org/format/2411.16746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LoBAM: LoRA-Based Backdoor Attack on Model Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jingwei Sun</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Minghong Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hai Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiran Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16746v1-abstract-short" style="display: inline;"> Model merging is an emerging technique that integrates multiple models fine-tuned on different tasks to create a versatile model that excels in multiple domains. This scheme, in the meantime, may open up backdoor attack opportunities where one single malicious model can jeopardize the integrity of the merged model. Existing works try to demonstrate the risk of such attacks by assuming substantial… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16746v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16746v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16746v1-abstract-full" style="display: none;"> Model merging is an emerging technique that integrates multiple models fine-tuned on different tasks to create a versatile model that excels in multiple domains. This scheme, in the meantime, may open up backdoor attack opportunities where one single malicious model can jeopardize the integrity of the merged model. Existing works try to demonstrate the risk of such attacks by assuming substantial computational resources, focusing on cases where the attacker can fully fine-tune the pre-trained model. Such an assumption, however, may not be feasible given the increasing size of machine learning models. In practice where resources are limited and the attacker can only employ techniques like Low-Rank Adaptation (LoRA) to produce the malicious model, it remains unclear whether the attack can still work and pose threats. In this work, we first identify that the attack efficacy is significantly diminished when using LoRA for fine-tuning. Then, we propose LoBAM, a method that yields high attack success rate with minimal training resources. The key idea of LoBAM is to amplify the malicious weights in an intelligent way that effectively enhances the attack efficacy. We demonstrate that our design can lead to improved attack success rate through both theoretical proof and extensive empirical experiments across various model merging scenarios. Moreover, we show that our method has strong stealthiness and is difficult to detect. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16746v1-abstract-full').style.display = 'none'; document.getElementById('2411.16746v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16481">arXiv:2411.16481</a> <span> [<a href="https://arxiv.org/pdf/2411.16481">pdf</a>, <a href="https://arxiv.org/format/2411.16481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deformable Mamba for Wide Field of View Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jie Hu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Junwei Zheng</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jiale Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Stiefelhagen%2C+R">Rainer Stiefelhagen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16481v1-abstract-short" style="display: inline;"> Wide-FoV cameras, like fisheye and panoramic setups, are essential for broader perception but introduce significant distortions in 180掳 and 360掳 images, complicating dense prediction tasks. For instance, existing MAMBA models lacking distortion-aware capacity cannot perform well in panoramic semantic segmentation. To address this problem, this work presents Deformable Mamba, a unified framework sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16481v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16481v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16481v1-abstract-full" style="display: none;"> Wide-FoV cameras, like fisheye and panoramic setups, are essential for broader perception but introduce significant distortions in 180掳 and 360掳 images, complicating dense prediction tasks. For instance, existing MAMBA models lacking distortion-aware capacity cannot perform well in panoramic semantic segmentation. To address this problem, this work presents Deformable Mamba, a unified framework specifically designed to address imaging distortions within the context of panoramic and fisheye semantic segmentation. At the core is a decoder constructed with a series of Deformable Mamba Fusion (DMF) blocks, making the whole framework more deformable, efficient, and accurate, when handling extreme distortions. Extensive evaluations across five datasets demonstrate that our method consistently improves segmentation accuracy compared to the previous state-of-the-art methods tailored for specific FoVs. Notably, Deformable Mamba achieves a +2.5% performance improvement on the 360掳 Stanford2D3D dataset, and shows better results across FoVs from 60掳 to 360掳. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16481v1-abstract-full').style.display = 'none'; document.getElementById('2411.16481v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Models and code will be made publicly available at: https://github.com/JieHu1996/DeformableMamba</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16331">arXiv:2411.16331</a> <span> [<a href="https://arxiv.org/pdf/2411.16331">pdf</a>, <a href="https://arxiv.org/format/2411.16331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sonic: Shifting Focus to Global Audio Perception in Portrait Animation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiaozhong Ji</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhihong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Junwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chuming Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qingdong He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yi Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qin Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qinglin Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16331v1-abstract-short" style="display: inline;"> The study of talking face generation mainly explores the intricacies of synchronizing facial movements and crafting visually appealing, temporally-coherent animations. However, due to the limited exploration of global audio perception, current approaches predominantly employ auxiliary visual and spatial knowledge to stabilize the movements, which often results in the deterioration of the naturalne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16331v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16331v1-abstract-full" style="display: none;"> The study of talking face generation mainly explores the intricacies of synchronizing facial movements and crafting visually appealing, temporally-coherent animations. However, due to the limited exploration of global audio perception, current approaches predominantly employ auxiliary visual and spatial knowledge to stabilize the movements, which often results in the deterioration of the naturalness and temporal inconsistencies.Considering the essence of audio-driven animation, the audio signal serves as the ideal and unique priors to adjust facial expressions and lip movements, without resorting to interference of any visual signals. Based on this motivation, we propose a novel paradigm, dubbed as Sonic, to {s}hift f{o}cus on the exploration of global audio per{c}ept{i}o{n}.To effectively leverage global audio knowledge, we disentangle it into intra- and inter-clip audio perception and collaborate with both aspects to enhance overall perception.For the intra-clip audio perception, 1). \textbf{Context-enhanced audio learning}, in which long-range intra-clip temporal audio knowledge is extracted to provide facial expression and lip motion priors implicitly expressed as the tone and speed of speech. 2). \textbf{Motion-decoupled controller}, in which the motion of the head and expression movement are disentangled and independently controlled by intra-audio clips. Most importantly, for inter-clip audio perception, as a bridge to connect the intra-clips to achieve the global perception, \textbf{Time-aware position shift fusion}, in which the global inter-clip audio information is considered and fused for long-audio inference via through consecutively time-aware shifted windows. Extensive experiments demonstrate that the novel audio-driven paradigm outperform existing SOTA methodologies in terms of video quality, temporally consistency, lip synchronization precision, and motion diversity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16331v1-abstract-full').style.display = 'none'; document.getElementById('2411.16331v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">refer to our main-page \url{https://jixiaozhong.github.io/Sonic/}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16170">arXiv:2411.16170</a> <span> [<a href="https://arxiv.org/pdf/2411.16170">pdf</a>, <a href="https://arxiv.org/format/2411.16170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CARE Transformer: Mobile-Friendly Linear Visual Transformer via Decoupled Dual Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qingshan Xu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+J">Jiequan Cui</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junbao Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+R">Richang Hong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanwang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16170v1-abstract-short" style="display: inline;"> Recently, large efforts have been made to design efficient linear-complexity visual Transformers. However, current linear attention models are generally unsuitable to be deployed in resource-constrained mobile devices, due to suffering from either few efficiency gains or significant accuracy drops. In this paper, we propose a new de\textbf{C}oupled du\textbf{A}l-interactive linea\textbf{R} att\tex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16170v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16170v1-abstract-full" style="display: none;"> Recently, large efforts have been made to design efficient linear-complexity visual Transformers. However, current linear attention models are generally unsuitable to be deployed in resource-constrained mobile devices, due to suffering from either few efficiency gains or significant accuracy drops. In this paper, we propose a new de\textbf{C}oupled du\textbf{A}l-interactive linea\textbf{R} att\textbf{E}ntion (CARE) mechanism, revealing that features' decoupling and interaction can fully unleash the power of linear attention. We first propose an asymmetrical feature decoupling strategy that asymmetrically decouples the learning process for local inductive bias and long-range dependencies, thereby preserving sufficient local and global information while effectively enhancing the efficiency of models. Then, a dynamic memory unit is employed to maintain critical information along the network pipeline. Moreover, we design a dual interaction module to effectively facilitate interaction between local inductive bias and long-range information as well as among features at different layers. By adopting a decoupled learning way and fully exploiting complementarity across features, our method can achieve both high efficiency and accuracy. Extensive experiments on ImageNet-1K, COCO, and ADE20K datasets demonstrate the effectiveness of our approach, e.g., achieving $78.4/82.1\%$ top-1 accuracy on ImagegNet-1K at the cost of only $0.7/1.9$ GMACs. Codes will be released on \href{..}{github}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16170v1-abstract-full').style.display = 'none'; document.getElementById('2411.16170v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16064">arXiv:2411.16064</a> <span> [<a href="https://arxiv.org/pdf/2411.16064">pdf</a>, <a href="https://arxiv.org/format/2411.16064">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Granularity Class Prototype Topology Distillation for Class-Incremental Source-Free Unsupervised Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+P">Peihua Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiehua Zhang</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+X">Xichun Sheng</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+C">Chenggang Yan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yaoqi Sun</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Ying Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Liang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16064v1-abstract-short" style="display: inline;"> This paper explores the Class-Incremental Source-Free Unsupervised Domain Adaptation (CI-SFUDA) problem, where the unlabeled target data come incrementally without access to labeled source instances. This problem poses two challenges, the disturbances of similar source-class knowledge to target-class representation learning and the new target knowledge to old ones. To address them, we propose the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16064v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16064v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16064v1-abstract-full" style="display: none;"> This paper explores the Class-Incremental Source-Free Unsupervised Domain Adaptation (CI-SFUDA) problem, where the unlabeled target data come incrementally without access to labeled source instances. This problem poses two challenges, the disturbances of similar source-class knowledge to target-class representation learning and the new target knowledge to old ones. To address them, we propose the Multi-Granularity Class Prototype Topology Distillation (GROTO) algorithm, which effectively transfers the source knowledge to the unlabeled class-incremental target domain. Concretely, we design the multi-granularity class prototype self-organization module and prototype topology distillation module. Firstly, the positive classes are mined by modeling two accumulation distributions. Then, we generate reliable pseudo-labels by introducing multi-granularity class prototypes, and use them to promote the positive-class target feature self-organization. Secondly, the positive-class prototypes are leveraged to construct the topological structures of source and target feature spaces. Then, we perform the topology distillation to continually mitigate the interferences of new target knowledge to old ones. Extensive experiments demonstrate that our proposed method achieves state-of-the-art performances on three public datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16064v1-abstract-full').style.display = 'none'; document.getElementById('2411.16064v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15941">arXiv:2411.15941</a> <span> [<a href="https://arxiv.org/pdf/2411.15941">pdf</a>, <a href="https://arxiv.org/format/2411.15941">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MobileMamba: Lightweight Multi-Receptive Visual Mamba Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+H">Haoyang He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuxuan Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongxu Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Zhenye Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yabiao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yunsheng Wu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15941v1-abstract-short" style="display: inline;"> Previous research on lightweight models has primarily focused on CNNs and Transformer-based designs. CNNs, with their local receptive fields, struggle to capture long-range dependencies, while Transformers, despite their global modeling capabilities, are limited by quadratic computational complexity in high-resolution scenarios. Recently, state-space models have gained popularity in the visual dom… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15941v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15941v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15941v1-abstract-full" style="display: none;"> Previous research on lightweight models has primarily focused on CNNs and Transformer-based designs. CNNs, with their local receptive fields, struggle to capture long-range dependencies, while Transformers, despite their global modeling capabilities, are limited by quadratic computational complexity in high-resolution scenarios. Recently, state-space models have gained popularity in the visual domain due to their linear computational complexity. Despite their low FLOPs, current lightweight Mamba-based models exhibit suboptimal throughput. In this work, we propose the MobileMamba framework, which balances efficiency and performance. We design a three-stage network to enhance inference speed significantly. At a fine-grained level, we introduce the Multi-Receptive Field Feature Interaction(MRFFI) module, comprising the Long-Range Wavelet Transform-Enhanced Mamba(WTE-Mamba), Efficient Multi-Kernel Depthwise Convolution(MK-DeConv), and Eliminate Redundant Identity components. This module integrates multi-receptive field information and enhances high-frequency detail extraction. Additionally, we employ training and testing strategies to further improve performance and efficiency. MobileMamba achieves up to 83.6% on Top-1, surpassing existing state-of-the-art methods which is maximum x21 faster than LocalVim on GPU. Extensive experiments on high-resolution downstream tasks demonstrate that MobileMamba surpasses current efficient models, achieving an optimal balance between speed and accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15941v1-abstract-full').style.display = 'none'; document.getElementById('2411.15941v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15843">arXiv:2411.15843</a> <span> [<a href="https://arxiv.org/pdf/2411.15843">pdf</a>, <a href="https://arxiv.org/format/2411.15843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unveil Inversion and Invariance in Flow Transformer for Versatile Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+P">Pengcheng Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qingdong He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yunsheng Wu</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+C">Charles Ling</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15843v2-abstract-short" style="display: inline;"> Leveraging the large generative prior of the flow transformer for tuning-free image editing requires authentic inversion to project the image into the model's domain and a flexible invariance control mechanism to preserve non-target contents. However, the prevailing diffusion inversion performs deficiently in flow-based models, and the invariance control cannot reconcile diverse rigid and non-rigi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15843v2-abstract-full').style.display = 'inline'; document.getElementById('2411.15843v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15843v2-abstract-full" style="display: none;"> Leveraging the large generative prior of the flow transformer for tuning-free image editing requires authentic inversion to project the image into the model's domain and a flexible invariance control mechanism to preserve non-target contents. However, the prevailing diffusion inversion performs deficiently in flow-based models, and the invariance control cannot reconcile diverse rigid and non-rigid editing tasks. To address these, we systematically analyze the \textbf{inversion and invariance} control based on the flow transformer. Specifically, we unveil that the Euler inversion shares a similar structure to DDIM yet is more susceptible to the approximation error. Thus, we propose a two-stage inversion to first refine the velocity estimation and then compensate for the leftover error, which pivots closely to the model prior and benefits editing. Meanwhile, we propose the invariance control that manipulates the text features within the adaptive layer normalization, connecting the changes in the text prompt to image semantics. This mechanism can simultaneously preserve the non-target contents while allowing rigid and non-rigid manipulation, enabling a wide range of editing types such as visual text, quantity, facial expression, etc. Experiments on versatile scenarios validate that our framework achieves flexible and accurate editing, unlocking the potential of the flow transformer for versatile image editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15843v2-abstract-full').style.display = 'none'; document.getElementById('2411.15843v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://pengchengpcx.github.io/EditFT/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15604">arXiv:2411.15604</a> <span> [<a href="https://arxiv.org/pdf/2411.15604">pdf</a>, <a href="https://arxiv.org/format/2411.15604">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FATE: Full-head Gaussian Avatar with Textural Editing from Monocular Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zijian Wu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhiyang Liang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yicheng Gong</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+D">Dongfang Hu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yao Yao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xun Cao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15604v1-abstract-short" style="display: inline;"> Reconstructing high-fidelity, animatable 3D head avatars from effortlessly captured monocular videos is a pivotal yet formidable challenge. Although significant progress has been made in rendering performance and manipulation capabilities, notable challenges remain, including incomplete reconstruction and inefficient Gaussian representation. To address these challenges, we introduce FATE, a novel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15604v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15604v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15604v1-abstract-full" style="display: none;"> Reconstructing high-fidelity, animatable 3D head avatars from effortlessly captured monocular videos is a pivotal yet formidable challenge. Although significant progress has been made in rendering performance and manipulation capabilities, notable challenges remain, including incomplete reconstruction and inefficient Gaussian representation. To address these challenges, we introduce FATE, a novel method for reconstructing an editable full-head avatar from a single monocular video. FATE integrates a sampling-based densification strategy to ensure optimal positional distribution of points, improving rendering efficiency. A neural baking technique is introduced to convert discrete Gaussian representations into continuous attribute maps, facilitating intuitive appearance editing. Furthermore, we propose a universal completion framework to recover non-frontal appearance, culminating in a 360$^\circ$-renderable 3D head avatar. FATE outperforms previous approaches in both qualitative and quantitative evaluations, achieving state-of-the-art performance. To the best of our knowledge, FATE is the first animatable and 360$^\circ$ full-head monocular reconstruction method for a 3D head avatar. The code will be publicly released upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15604v1-abstract-full').style.display = 'none'; document.getElementById('2411.15604v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://zjwfufu.github.io/FATE-page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15600">arXiv:2411.15600</a> <span> [<a href="https://arxiv.org/pdf/2411.15600">pdf</a>, <a href="https://arxiv.org/format/2411.15600">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> How Texts Help? A Fine-grained Evaluation to Reveal the Role of Language in Vision-Language Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuchen Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shiyu Hu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xiaokun Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dailing Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Meiqi Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaiqi Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15600v1-abstract-short" style="display: inline;"> Vision-language tracking (VLT) extends traditional single object tracking by incorporating textual information, providing semantic guidance to enhance tracking performance under challenging conditions like fast motion and deformations. However, current VLT trackers often underperform compared to single-modality methods on multiple benchmarks, with semantic information sometimes becoming a "distrac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15600v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15600v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15600v1-abstract-full" style="display: none;"> Vision-language tracking (VLT) extends traditional single object tracking by incorporating textual information, providing semantic guidance to enhance tracking performance under challenging conditions like fast motion and deformations. However, current VLT trackers often underperform compared to single-modality methods on multiple benchmarks, with semantic information sometimes becoming a "distraction." To address this, we propose VLTVerse, the first fine-grained evaluation framework for VLT trackers that comprehensively considers multiple challenge factors and diverse semantic information, hoping to reveal the role of language in VLT. Our contributions include: (1) VLTVerse introduces 10 sequence-level challenge labels and 6 types of multi-granularity semantic information, creating a flexible and multi-dimensional evaluation space for VLT; (2) leveraging 60 subspaces formed by combinations of challenge factors and semantic types, we conduct systematic fine-grained evaluations of three mainstream SOTA VLT trackers, uncovering their performance bottlenecks across complex scenarios and offering a novel perspective on VLT evaluation; (3) through decoupled analysis of experimental results, we examine the impact of various semantic types on specific challenge factors in relation to different algorithms, providing essential guidance for enhancing VLT across data, evaluation, and algorithmic dimensions. The VLTVerse, toolkit, and results will be available at \url{http://metaverse.aitestunion.com}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15600v1-abstract-full').style.display = 'none'; document.getElementById('2411.15600v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint, Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15411">arXiv:2411.15411</a> <span> [<a href="https://arxiv.org/pdf/2411.15411">pdf</a>, <a href="https://arxiv.org/format/2411.15411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FINECAPTION: Compositional Image Captioning Focusing on Wherever You Want at Any Granularity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hua%2C+H">Hang Hua</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jing Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yilin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15411v1-abstract-short" style="display: inline;"> The advent of large Vision-Language Models (VLMs) has significantly advanced multimodal tasks, enabling more sophisticated and accurate reasoning across various applications, including image and video captioning, visual question answering, and cross-modal retrieval. Despite their superior capabilities, VLMs struggle with fine-grained image regional composition information perception. Specifically,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15411v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15411v1-abstract-full" style="display: none;"> The advent of large Vision-Language Models (VLMs) has significantly advanced multimodal tasks, enabling more sophisticated and accurate reasoning across various applications, including image and video captioning, visual question answering, and cross-modal retrieval. Despite their superior capabilities, VLMs struggle with fine-grained image regional composition information perception. Specifically, they have difficulty accurately aligning the segmentation masks with the corresponding semantics and precisely describing the compositional aspects of the referred regions. However, compositionality - the ability to understand and generate novel combinations of known visual and textual components - is critical for facilitating coherent reasoning and understanding across modalities by VLMs. To address this issue, we propose FINECAPTION, a novel VLM that can recognize arbitrary masks as referential inputs and process high-resolution images for compositional image captioning at different granularity levels. To support this endeavor, we introduce COMPOSITIONCAP, a new dataset for multi-grained region compositional image captioning, which introduces the task of compositional attribute-aware regional image captioning. Empirical results demonstrate the effectiveness of our proposed model compared to other state-of-the-art VLMs. Additionally, we analyze the capabilities of current VLMs in recognizing various visual prompts for compositional region image captioning, highlighting areas for improvement in VLM design and training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15411v1-abstract-full').style.display = 'none'; document.getElementById('2411.15411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15276">arXiv:2411.15276</a> <span> [<a href="https://arxiv.org/pdf/2411.15276">pdf</a>, <a href="https://arxiv.org/format/2411.15276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Event USKT : U-State Space Model in Knowledge Transfer for Event Cameras </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuhui Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiahao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyuan Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jimin Xiao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Ding Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenjun Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiaxuan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15276v1-abstract-short" style="display: inline;"> Event cameras, as an emerging imaging technology, offer distinct advantages over traditional RGB cameras, including reduced energy consumption and higher frame rates. However, the limited quantity of available event data presents a significant challenge, hindering their broader development. To alleviate this issue, we introduce a tailored U-shaped State Space Model Knowledge Transfer (USKT) framew… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15276v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15276v1-abstract-full" style="display: none;"> Event cameras, as an emerging imaging technology, offer distinct advantages over traditional RGB cameras, including reduced energy consumption and higher frame rates. However, the limited quantity of available event data presents a significant challenge, hindering their broader development. To alleviate this issue, we introduce a tailored U-shaped State Space Model Knowledge Transfer (USKT) framework for Event-to-RGB knowledge transfer. This framework generates inputs compatible with RGB frames, enabling event data to effectively reuse pre-trained RGB models and achieve competitive performance with minimal parameter tuning. Within the USKT architecture, we also propose a bidirectional reverse state space model. Unlike conventional bidirectional scanning mechanisms, the proposed Bidirectional Reverse State Space Model (BiR-SSM) leverages a shared weight strategy, which facilitates efficient modeling while conserving computational resources. In terms of effectiveness, integrating USKT with ResNet50 as the backbone improves model performance by 0.95%, 3.57%, and 2.9% on DVS128 Gesture, N-Caltech101, and CIFAR-10-DVS datasets, respectively, underscoring USKT's adaptability and effectiveness. The code will be made available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15276v1-abstract-full').style.display = 'none'; document.getElementById('2411.15276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15255">arXiv:2411.15255</a> <span> [<a href="https://arxiv.org/pdf/2411.15255">pdf</a>, <a href="https://arxiv.org/format/2411.15255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OSMamba: Omnidirectional Spectral Mamba with Dual-Domain Prior Generator for Exposure Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Gehui Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15255v1-abstract-short" style="display: inline;"> Exposure correction is a fundamental problem in computer vision and image processing. Recently, frequency domain-based methods have achieved impressive improvement, yet they still struggle with complex real-world scenarios under extreme exposure conditions. This is due to the local convolutional receptive fields failing to model long-range dependencies in the spectrum, and the non-generative learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15255v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15255v1-abstract-full" style="display: none;"> Exposure correction is a fundamental problem in computer vision and image processing. Recently, frequency domain-based methods have achieved impressive improvement, yet they still struggle with complex real-world scenarios under extreme exposure conditions. This is due to the local convolutional receptive fields failing to model long-range dependencies in the spectrum, and the non-generative learning paradigm being inadequate for retrieving lost details from severely degraded regions. In this paper, we propose Omnidirectional Spectral Mamba (OSMamba), a novel exposure correction network that incorporates the advantages of state space models and generative diffusion models to address these limitations. Specifically, OSMamba introduces an omnidirectional spectral scanning mechanism that adapts Mamba to the frequency domain to capture comprehensive long-range dependencies in both the amplitude and phase spectra of deep image features, hence enhancing illumination correction and structure recovery. Furthermore, we develop a dual-domain prior generator that learns from well-exposed images to generate a degradation-free diffusion prior containing correct information about severely under- and over-exposed regions for better detail restoration. Extensive experiments on multiple-exposure and mixed-exposure datasets demonstrate that the proposed OSMamba achieves state-of-the-art performance both quantitatively and qualitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15255v1-abstract-full').style.display = 'none'; document.getElementById('2411.15255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14847">arXiv:2411.14847</a> <span> [<a href="https://arxiv.org/pdf/2411.14847">pdf</a>, <a href="https://arxiv.org/format/2411.14847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dynamics-Aware Gaussian Splatting Streaming Towards Fast On-the-Fly Training for 4D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhening Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yingdong Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jiawei Shao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zehong Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14847v1-abstract-short" style="display: inline;"> The recent development of 3D Gaussian Splatting (3DGS) has led to great interest in 4D dynamic spatial reconstruction from multi-view visual inputs. While existing approaches mainly rely on processing full-length multi-view videos for 4D reconstruction, there has been limited exploration of iterative online reconstruction methods that enable on-the-fly training and per-frame streaming. Current 3DG… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14847v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14847v1-abstract-full" style="display: none;"> The recent development of 3D Gaussian Splatting (3DGS) has led to great interest in 4D dynamic spatial reconstruction from multi-view visual inputs. While existing approaches mainly rely on processing full-length multi-view videos for 4D reconstruction, there has been limited exploration of iterative online reconstruction methods that enable on-the-fly training and per-frame streaming. Current 3DGS-based streaming methods treat the Gaussian primitives uniformly and constantly renew the densified Gaussians, thereby overlooking the difference between dynamic and static features and also neglecting the temporal continuity in the scene. To address these limitations, we propose a novel three-stage pipeline for iterative streamable 4D dynamic spatial reconstruction. Our pipeline comprises a selective inheritance stage to preserve temporal continuity, a dynamics-aware shift stage for distinguishing dynamic and static primitives and optimizing their movements, and an error-guided densification stage to accommodate emerging objects. Our method achieves state-of-the-art performance in online 4D reconstruction, demonstrating a 20% improvement in on-the-fly training speed, superior representation quality, and real-time rendering capability. Project page: https://www.liuzhening.top/DASS <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14847v1-abstract-full').style.display = 'none'; document.getElementById('2411.14847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://www.liuzhening.top/DASS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14834">arXiv:2411.14834</a> <span> [<a href="https://arxiv.org/pdf/2411.14834">pdf</a>, <a href="https://arxiv.org/format/2411.14834">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Gradient Masking All-at-Once: Ensemble Everything Everywhere Is Not Robust </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Nikoli%C4%87%2C+K">Kristina Nikoli膰</a>, <a href="/search/cs?searchtype=author&query=Carlini%2C+N">Nicholas Carlini</a>, <a href="/search/cs?searchtype=author&query=Tram%C3%A8r%2C+F">Florian Tram猫r</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14834v1-abstract-short" style="display: inline;"> Ensemble everything everywhere is a defense to adversarial examples that was recently proposed to make image classifiers robust. This defense works by ensembling a model's intermediate representations at multiple noisy image resolutions, producing a single robust classification. This defense was shown to be effective against multiple state-of-the-art attacks. Perhaps even more convincingly, it was… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14834v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14834v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14834v1-abstract-full" style="display: none;"> Ensemble everything everywhere is a defense to adversarial examples that was recently proposed to make image classifiers robust. This defense works by ensembling a model's intermediate representations at multiple noisy image resolutions, producing a single robust classification. This defense was shown to be effective against multiple state-of-the-art attacks. Perhaps even more convincingly, it was shown that the model's gradients are perceptually aligned: attacks against the model produce noise that perceptually resembles the targeted class. In this short note, we show that this defense is not robust to adversarial attack. We first show that the defense's randomness and ensembling method cause severe gradient masking. We then use standard adaptive attack techniques to reduce the defense's robust accuracy from 48% to 1% on CIFAR-100 and from 62% to 4% on CIFAR-10, under the $\ell_\infty$-norm threat model with $\varepsilon=8/255$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14834v1-abstract-full').style.display = 'none'; document.getElementById('2411.14834v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14755">arXiv:2411.14755</a> <span> [<a href="https://arxiv.org/pdf/2411.14755">pdf</a>, <a href="https://arxiv.org/format/2411.14755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> FairAdapter: Detecting AI-generated Images with Improved Fairness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+F">Feng Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xinan He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jianfeng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14755v1-abstract-short" style="display: inline;"> The high-quality, realistic images generated by generative models pose significant challenges for exposing them.So far, data-driven deep neural networks have been justified as the most efficient forensics tools for the challenges. However, they may be over-fitted to certain semantics, resulting in considerable inconsistency in detection performance across different contents of generated samples. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14755v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14755v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14755v1-abstract-full" style="display: none;"> The high-quality, realistic images generated by generative models pose significant challenges for exposing them.So far, data-driven deep neural networks have been justified as the most efficient forensics tools for the challenges. However, they may be over-fitted to certain semantics, resulting in considerable inconsistency in detection performance across different contents of generated samples. It could be regarded as an issue of detection fairness. In this paper, we propose a novel framework named Fairadapter to tackle the issue. In comparison with existing state-of-the-art methods, our model achieves improved fairness performance. Our project: https://github.com/AppleDogDog/FairnessDetection <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14755v1-abstract-full').style.display = 'none'; document.getElementById('2411.14755v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14725">arXiv:2411.14725</a> <span> [<a href="https://arxiv.org/pdf/2411.14725">pdf</a>, <a href="https://arxiv.org/format/2411.14725">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Evaluating and Advancing Multimodal Large Language Models in Ability Lens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+F">Feng Chen</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+C">Chenhui Gou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jing Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenbang Sun</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+B">Bohan Zhuang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14725v1-abstract-short" style="display: inline;"> As multimodal large language models (MLLMs) advance rapidly, rigorous evaluation has become essential, providing further guidance for their development. In this work, we focus on a unified and robust evaluation of \textbf{vision perception} abilities, the foundational skill of MLLMs. We find that existing perception benchmarks, each focusing on different question types, domains, and evaluation met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14725v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14725v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14725v1-abstract-full" style="display: none;"> As multimodal large language models (MLLMs) advance rapidly, rigorous evaluation has become essential, providing further guidance for their development. In this work, we focus on a unified and robust evaluation of \textbf{vision perception} abilities, the foundational skill of MLLMs. We find that existing perception benchmarks, each focusing on different question types, domains, and evaluation metrics, introduce significant evaluation variance, complicating comprehensive assessments of perception abilities when relying on any single benchmark. To address this, we introduce \textbf{AbilityLens}, a unified benchmark designed to evaluate MLLMs across six key perception abilities, focusing on both accuracy and stability, with each ability encompassing diverse question types, domains, and metrics. With the assistance of AbilityLens, we: (1) identify the strengths and weaknesses of current models, highlighting stability patterns and revealing a notable performance gap between open-source and closed-source models; (2) introduce an online evaluation mode, which uncovers interesting ability conflict and early convergence phenomena during MLLM training; and (3) design a simple ability-specific model merging method that combines the best ability checkpoint from early training stages, effectively mitigating performance decline due to ability conflict. The benchmark and online leaderboard will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14725v1-abstract-full').style.display = 'none'; document.getElementById('2411.14725v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14637">arXiv:2411.14637</a> <span> [<a href="https://arxiv.org/pdf/2411.14637">pdf</a>, <a href="https://arxiv.org/format/2411.14637">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Clinical Trial Patient Matching through Knowledge Augmentation with Multi-Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+H">Hanwen Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kunpeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14637v1-abstract-short" style="display: inline;"> Matching patients effectively and efficiently for clinical trials is a significant challenge due to the complexity and variability of patient profiles and trial criteria. This paper presents a novel framework, Multi-Agents for Knowledge Augmentation (MAKA), designed to enhance patient-trial matching by dynamically supplementing matching prompts with external, domain-specific knowledge. The MAKA ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14637v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14637v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14637v1-abstract-full" style="display: none;"> Matching patients effectively and efficiently for clinical trials is a significant challenge due to the complexity and variability of patient profiles and trial criteria. This paper presents a novel framework, Multi-Agents for Knowledge Augmentation (MAKA), designed to enhance patient-trial matching by dynamically supplementing matching prompts with external, domain-specific knowledge. The MAKA architecture consists of five key components: a knowledge probing agent that detects gaps in domain knowledge, a navigation agent that manages interactions among multiple specialized knowledge augmentation agents, a knowledge augmentation agent that incorporates relevant information into patient-trial matching prompts, a supervision agent aligning the outputs from other agents with the instructions and a matching agent making the final selection decision. This approach enhances the accuracy and contextual richness of patient matching, addresses inherent knowledge gaps in both trail criteria and large language models (LLMs), and improves the alignment between patient characteristics and the criteria. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14637v1-abstract-full').style.display = 'none'; document.getElementById('2411.14637v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14594">arXiv:2411.14594</a> <span> [<a href="https://arxiv.org/pdf/2411.14594">pdf</a>, <a href="https://arxiv.org/format/2411.14594">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Solving Zero-Shot 3D Visual Grounding as Constraint Satisfaction Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Q">Qihao Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kailai Li</a>, <a href="/search/cs?searchtype=author&query=Stiefelhagen%2C+R">Rainer Stiefelhagen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14594v1-abstract-short" style="display: inline;"> 3D visual grounding (3DVG) aims to locate objects in a 3D scene with natural language descriptions. Supervised methods have achieved decent accuracy, but have a closed vocabulary and limited language understanding ability. Zero-shot methods mostly utilize large language models (LLMs) to handle natural language descriptions, yet suffer from slow inference speed. To address these problems, in this w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14594v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14594v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14594v1-abstract-full" style="display: none;"> 3D visual grounding (3DVG) aims to locate objects in a 3D scene with natural language descriptions. Supervised methods have achieved decent accuracy, but have a closed vocabulary and limited language understanding ability. Zero-shot methods mostly utilize large language models (LLMs) to handle natural language descriptions, yet suffer from slow inference speed. To address these problems, in this work, we propose a zero-shot method that reformulates the 3DVG task as a Constraint Satisfaction Problem (CSP), where the variables and constraints represent objects and their spatial relations, respectively. This allows a global reasoning of all relevant objects, producing grounding results of both the target and anchor objects. Moreover, we demonstrate the flexibility of our framework by handling negation- and counting-based queries with only minor extra coding efforts. Our system, Constraint Satisfaction Visual Grounding (CSVG), has been extensively evaluated on the public datasets ScanRefer and Nr3D datasets using only open-source LLMs. Results show the effectiveness of CSVG and superior grounding accuracy over current state-of-the-art zero-shot 3DVG methods with improvements of $+7.0\%$ (Acc@0.5 score) and $+11.2\%$ on the ScanRefer and Nr3D datasets, respectively. The code of our system is publicly available at https://github.com/sunsleaf/CSVG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14594v1-abstract-full').style.display = 'none'; document.getElementById('2411.14594v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14572">arXiv:2411.14572</a> <span> [<a href="https://arxiv.org/pdf/2411.14572">pdf</a>, <a href="https://arxiv.org/format/2411.14572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Knowledge Checking in Retrieval-augmented Generation: A Representation Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shenglai Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiankun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bingheng Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuping Lin</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Everaert%2C+D">Dante Everaert</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hanqing Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Y">Yue Xing</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M+X">Monica Xiao Cheng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiliang Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14572v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) systems have shown promise in enhancing the performance of Large Language Models (LLMs). However, these systems face challenges in effectively integrating external knowledge with the LLM's internal knowledge, often leading to issues with misleading or unhelpful information. This work aims to provide a systematic study on knowledge checking in RAG systems. We co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14572v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14572v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14572v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) systems have shown promise in enhancing the performance of Large Language Models (LLMs). However, these systems face challenges in effectively integrating external knowledge with the LLM's internal knowledge, often leading to issues with misleading or unhelpful information. This work aims to provide a systematic study on knowledge checking in RAG systems. We conduct a comprehensive analysis of LLM representation behaviors and demonstrate the significance of using representations in knowledge checking. Motivated by the findings, we further develop representation-based classifiers for knowledge filtering. We show substantial improvements in RAG performance, even when dealing with noisy knowledge databases. Our study provides new insights into leveraging LLM representations for enhancing the reliability and effectiveness of RAG systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14572v1-abstract-full').style.display = 'none'; document.getElementById('2411.14572v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14384">arXiv:2411.14384</a> <span> [<a href="https://arxiv.org/pdf/2411.14384">pdf</a>, <a href="https://arxiv.org/format/2411.14384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Baking Gaussian Splatting into Diffusion Denoiser for Fast and Scalable Single-stage Image-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Yuille%2C+A">Alan Yuille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14384v2-abstract-short" style="display: inline;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly out… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14384v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14384v2-abstract-full" style="display: none;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly outputs 3D Gaussian point clouds at each timestep to enforce view consistency and allow the model to generate robustly given prompt views of any directions, beyond object-centric inputs. Plus, to improve the capability and generalization ability of DiffusionGS, we scale up 3D training data by developing a scene-object mixed training strategy. Experiments show that our method enjoys better generation quality (2.20 dB higher in PSNR and 23.25 lower in FID) and over 5x faster speed (~6s on an A100 GPU) than SOTA methods. The user study and text-to-3D applications also reveals the practical values of our method. Our Project page at https://caiyuanhao1998.github.io/project/DiffusionGS/ shows the video and interactive generation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v2-abstract-full').style.display = 'none'; document.getElementById('2411.14384v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A novel one-stage 3DGS-based diffusion generates objects and scenes from a single view in ~6 seconds</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14296">arXiv:2411.14296</a> <span> [<a href="https://arxiv.org/pdf/2411.14296">pdf</a>, <a href="https://arxiv.org/format/2411.14296">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Routability Prediction via NAS Using a Smooth One-shot Augmented Predictor </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sridhar%2C+A">Arjun Sridhar</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Chen-Chia Chang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junyao Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiran Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14296v1-abstract-short" style="display: inline;"> Routability optimization in modern EDA tools has benefited greatly from using machine learning (ML) models. Constructing and optimizing the performance of ML models continues to be a challenge. Neural Architecture Search (NAS) serves as a tool to aid in the construction and improvement of these models. Traditional NAS techniques struggle to perform well on routability prediction as a result of two… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14296v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14296v1-abstract-full" style="display: none;"> Routability optimization in modern EDA tools has benefited greatly from using machine learning (ML) models. Constructing and optimizing the performance of ML models continues to be a challenge. Neural Architecture Search (NAS) serves as a tool to aid in the construction and improvement of these models. Traditional NAS techniques struggle to perform well on routability prediction as a result of two primary factors. First, the separation between the training objective and the search objective adds noise to the NAS process. Secondly, the increased variance of the search objective further complicates performing NAS. We craft a novel NAS technique, coined SOAP-NAS, to address these challenges through novel data augmentation techniques and a novel combination of one-shot and predictor-based NAS. Results show that our technique outperforms existing solutions by 40% closer to the ideal performance measured by ROC-AUC (area under the receiver operating characteristic curve) in DRC hotspot detection. SOAPNet is able to achieve an ROC-AUC of 0.9802 and a query time of only 0.461 ms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14296v1-abstract-full').style.display = 'none'; document.getElementById('2411.14296v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13842">arXiv:2411.13842</a> <span> [<a href="https://arxiv.org/pdf/2411.13842">pdf</a>, <a href="https://arxiv.org/format/2411.13842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Detecting Human Artifacts from Text-to-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaihong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13842v1-abstract-short" style="display: inline;"> Despite recent advancements, text-to-image generation models often produce images containing artifacts, especially in human figures. These artifacts appear as poorly generated human bodies, including distorted, missing, or extra body parts, leading to visual inconsistencies with typical human anatomy and greatly impairing overall fidelity. In this study, we address this challenge by curating Human… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13842v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13842v1-abstract-full" style="display: none;"> Despite recent advancements, text-to-image generation models often produce images containing artifacts, especially in human figures. These artifacts appear as poorly generated human bodies, including distorted, missing, or extra body parts, leading to visual inconsistencies with typical human anatomy and greatly impairing overall fidelity. In this study, we address this challenge by curating Human Artifact Dataset (HAD), the first large-scale dataset specifically designed to identify and localize human artifacts. HAD comprises over 37,000 images generated by several popular text-to-image models, annotated for human artifact localization. Using this dataset, we train the Human Artifact Detection Models (HADM), which can identify diverse artifact types across multiple generative domains and demonstrate strong generalization, even on images from unseen generators. Additionally, to further improve generators' perception of human structural coherence, we use the predictions from our HADM as feedback for diffusion model finetuning. Our experiments confirm a reduction in human artifacts in the resulting model. Furthermore, we showcase a novel application of our HADM in an iterative inpainting framework to correct human artifacts in arbitrary images directly, demonstrating its utility in improving image quality. Our dataset and detection models are available at: \url{https://github.com/wangkaihong/HADM}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13842v1-abstract-full').style.display = 'none'; document.getElementById('2411.13842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13715">arXiv:2411.13715</a> <span> [<a href="https://arxiv.org/pdf/2411.13715">pdf</a>, <a href="https://arxiv.org/format/2411.13715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SimPhony: A Device-Circuit-Architecture Cross-Layer Modeling and Simulation Framework for Heterogeneous Electronic-Photonic AI System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&query=Begovic%2C+A">Amir Begovic</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Rena Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13715v1-abstract-short" style="display: inline;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accura… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13715v1-abstract-full" style="display: none;"> Electronic-photonic integrated circuits (EPICs) offer transformative potential for next-generation high-performance AI but require interdisciplinary advances across devices, circuits, architecture, and design automation. The complexity of hybrid systems makes it challenging even for domain experts to understand distinct behaviors and interactions across design stack. The lack of a flexible, accurate, fast, and easy-to-use EPIC AI system simulation framework significantly limits the exploration of hardware innovations and system evaluations on common benchmarks. To address this gap, we propose SimPhony, a cross-layer modeling and simulation framework for heterogeneous electronic-photonic AI systems. SimPhony offers a platform that enables (1) generic, extensible hardware topology representation that supports heterogeneous multi-core architectures with diverse photonic tensor core designs; (2) optics-specific dataflow modeling with unique multi-dimensional parallelism and reuse beyond spatial/temporal dimensions; (3) data-aware energy modeling with realistic device responses, layout-aware area estimation, link budget analysis, and bandwidth-adaptive memory modeling; and (4) seamless integration with model training framework for hardware/software co-simulation. By providing a unified, versatile, and high-fidelity simulation platform, SimPhony enables researchers to innovate and evaluate EPIC AI hardware across multiple domains, facilitating the next leap in emerging AI hardware. We open-source our codes at https://github.com/ScopeX-ASU/SimPhony <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13715v1-abstract-full').style.display = 'none'; document.getElementById('2411.13715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13580">arXiv:2411.13580</a> <span> [<a href="https://arxiv.org/pdf/2411.13580">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.autcon.2017.06.021">10.1016/j.autcon.2017.06.021 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Multi-Server Information-Sharing Environment for Cross-Party Collaboration on A Private Cloud </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianping Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhenzhong Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiarui Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fangqiang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13580v1-abstract-short" style="display: inline;"> Interoperability remains the key problem in multi-discipline collaboration based on building information modeling (BIM). Although various methods have been proposed to solve the technical issues of interoperability, such as data sharing and data consistency; organizational issues, including data ownership and data privacy, remain unresolved to date. These organizational issues prevent different st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13580v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13580v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13580v1-abstract-full" style="display: none;"> Interoperability remains the key problem in multi-discipline collaboration based on building information modeling (BIM). Although various methods have been proposed to solve the technical issues of interoperability, such as data sharing and data consistency; organizational issues, including data ownership and data privacy, remain unresolved to date. These organizational issues prevent different stakeholders from sharing their data due to concerns regarding losing control of the data. This study proposes a multi-server information-sharing approach on a private cloud after analyzing the requirements for cross-party collaboration to address the aforementioned issues and prepare for massive data handling in the near future. This approach adopts a global controller to track the location, ownership and privacy of the data, which are stored in different servers that are controlled by different parties. Furthermore, data consistency conventions, parallel sub-model extraction, and sub-model integration with model verification are investigated in depth to support information sharing in a distributed environment and to maintain data consistency. Thus, with this approach, the ownership and privacy of the data can be controlled by its owner while still enabling certain required data to be shared with other parties. Application of the multi-server approach for information interoperability and cross-party collaboration is illustrated using a real construction project of an airport terminal. Validation shows that the proposed approach is feasible for maintaining the ownership and privacy of the data while supporting cross-party data sharing and collaboration at the same time, thus avoiding possible legal problems regarding data copyrights or other legal issues. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13580v1-abstract-full').style.display = 'none'; document.getElementById('2411.13580v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Automation in Construction,2017 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13547">arXiv:2411.13547</a> <span> [<a href="https://arxiv.org/pdf/2411.13547">pdf</a>, <a href="https://arxiv.org/format/2411.13547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SpecTool: A Benchmark for Characterizing Errors in Tool-Use LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kokane%2C+S">Shirley Kokane</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Ming Zhu</a>, <a href="/search/cs?searchtype=author&query=Awalgaonkar%2C+T">Tulika Awalgaonkar</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianguo Zhang</a>, <a href="/search/cs?searchtype=author&query=Hoang%2C+T">Thai Hoang</a>, <a href="/search/cs?searchtype=author&query=Prabhakar%2C+A">Akshara Prabhakar</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zuxin Liu</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+T">Tian Lan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liangwei Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Juntao Tan</a>, <a href="/search/cs?searchtype=author&query=Murthy%2C+R">Rithesh Murthy</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+W">Weiran Yao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiwei Liu</a>, <a href="/search/cs?searchtype=author&query=Niebles%2C+J+C">Juan Carlos Niebles</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huan Wang</a>, <a href="/search/cs?searchtype=author&query=Heinecke%2C+S">Shelby Heinecke</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Savarese%2C+S">Silivo Savarese</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13547v1-abstract-short" style="display: inline;"> Evaluating the output of Large Language Models (LLMs) is one of the most critical aspects of building a performant compound AI system. Since the output from LLMs propagate to downstream steps, identifying LLM errors is crucial to system performance. A common task for LLMs in AI systems is tool use. While there are several benchmark environments for evaluating LLMs on this task, they typically only… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13547v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13547v1-abstract-full" style="display: none;"> Evaluating the output of Large Language Models (LLMs) is one of the most critical aspects of building a performant compound AI system. Since the output from LLMs propagate to downstream steps, identifying LLM errors is crucial to system performance. A common task for LLMs in AI systems is tool use. While there are several benchmark environments for evaluating LLMs on this task, they typically only give a success rate without any explanation of the failure cases. To solve this problem, we introduce SpecTool, a new benchmark to identify error patterns in LLM output on tool-use tasks. Our benchmark data set comprises of queries from diverse environments that can be used to test for the presence of seven newly characterized error patterns. Using SPECTOOL , we show that even the most prominent LLMs exhibit these error patterns in their outputs. Researchers can use the analysis and insights from SPECTOOL to guide their error mitigation strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13547v1-abstract-full').style.display = 'none'; document.getElementById('2411.13547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13383">arXiv:2411.13383</a> <span> [<a href="https://arxiv.org/pdf/2411.13383">pdf</a>, <a href="https://arxiv.org/format/2411.13383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Diffusion Compression for Real-World Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gehui Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Rongyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xindong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jie Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13383v1-abstract-short" style="display: inline;"> Real-world image super-resolution (Real-ISR) aims to reconstruct high-resolution images from low-resolution inputs degraded by complex, unknown processes. While many Stable Diffusion (SD)-based Real-ISR methods have achieved remarkable success, their slow, multi-step inference hinders practical deployment. Recent SD-based one-step networks like OSEDiff and S3Diff alleviate this issue but still inc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13383v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13383v1-abstract-full" style="display: none;"> Real-world image super-resolution (Real-ISR) aims to reconstruct high-resolution images from low-resolution inputs degraded by complex, unknown processes. While many Stable Diffusion (SD)-based Real-ISR methods have achieved remarkable success, their slow, multi-step inference hinders practical deployment. Recent SD-based one-step networks like OSEDiff and S3Diff alleviate this issue but still incur high computational costs due to their reliance on large pretrained SD models. This paper proposes a novel Real-ISR method, AdcSR, by distilling the one-step diffusion network OSEDiff into a streamlined diffusion-GAN model under our Adversarial Diffusion Compression (ADC) framework. We meticulously examine the modules of OSEDiff, categorizing them into two types: (1) Removable (VAE encoder, prompt extractor, text encoder, etc.) and (2) Prunable (denoising UNet and VAE decoder). Since direct removal and pruning can degrade the model's generation capability, we pretrain our pruned VAE decoder to restore its ability to decode images and employ adversarial distillation to compensate for performance loss. This ADC-based diffusion-GAN hybrid design effectively reduces complexity by 73% in inference time, 78% in computation, and 74% in parameters, while preserving the model's generation capability. Experiments manifest that our proposed AdcSR achieves competitive recovery quality on both synthetic and real-world datasets, offering up to 9.3$\times$ speedup over previous one-step diffusion-based methods. Code and models will be made available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13383v1-abstract-full').style.display = 'none'; document.getElementById('2411.13383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13314">arXiv:2411.13314</a> <span> [<a href="https://arxiv.org/pdf/2411.13314">pdf</a>, <a href="https://arxiv.org/format/2411.13314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> I2TTS: Image-indicated Immersive Text-to-speech Synthesis with Spatial Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tian-Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiaran Gao</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+X">Xinyuan Qian</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xu-Cheng Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13314v1-abstract-short" style="display: inline;"> Controlling the style and characteristics of speech synthesis is crucial for adapting the output to specific contexts and user requirements. Previous Text-to-speech (TTS) works have focused primarily on the technical aspects of producing natural-sounding speech, such as intonation, rhythm, and clarity. However, they overlook the fact that there is a growing emphasis on spatial perception of synthe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13314v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13314v1-abstract-full" style="display: none;"> Controlling the style and characteristics of speech synthesis is crucial for adapting the output to specific contexts and user requirements. Previous Text-to-speech (TTS) works have focused primarily on the technical aspects of producing natural-sounding speech, such as intonation, rhythm, and clarity. However, they overlook the fact that there is a growing emphasis on spatial perception of synthesized speech, which may provide immersive experience in gaming and virtual reality. To solve this issue, in this paper, we present a novel multi-modal TTS approach, namely Image-indicated Immersive Text-to-speech Synthesis (I2TTS). Specifically, we introduce a scene prompt encoder that integrates visual scene prompts directly into the synthesis pipeline to control the speech generation process. Additionally, we propose a reverberation classification and refinement technique that adjusts the synthesized mel-spectrogram to enhance the immersive experience, ensuring that the involved reverberation condition matches the scene accurately. Experimental results demonstrate that our model achieves high-quality scene and spatial matching without compromising speech naturalness, marking a significant advancement in the field of context-aware speech synthesis. Project demo page: https://spatialTTS.github.io/ Index Terms-Speech synthesis, scene prompt, spatial perception <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13314v1-abstract-full').style.display = 'none'; document.getElementById('2411.13314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5pages,4figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13136">arXiv:2411.13136</a> <span> [<a href="https://arxiv.org/pdf/2411.13136">pdf</a>, <a href="https://arxiv.org/format/2411.13136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TAPT: Test-Time Adversarial Prompt Tuning for Robust Inference in Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13136v1-abstract-short" style="display: inline;"> Large pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated excellent zero-shot generalizability across various downstream tasks. However, recent studies have shown that the inference performance of CLIP can be greatly degraded by small adversarial perturbations, especially its visual modality, posing significant safety threats. To mitigate this vulnerability, in this paper, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13136v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13136v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13136v1-abstract-full" style="display: none;"> Large pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated excellent zero-shot generalizability across various downstream tasks. However, recent studies have shown that the inference performance of CLIP can be greatly degraded by small adversarial perturbations, especially its visual modality, posing significant safety threats. To mitigate this vulnerability, in this paper, we propose a novel defense method called Test-Time Adversarial Prompt Tuning (TAPT) to enhance the inference robustness of CLIP against visual adversarial attacks. TAPT is a test-time defense method that learns defensive bimodal (textual and visual) prompts to robustify the inference process of CLIP. Specifically, it is an unsupervised method that optimizes the defensive prompts for each test sample by minimizing a multi-view entropy and aligning adversarial-clean distributions. We evaluate the effectiveness of TAPT on 11 benchmark datasets, including ImageNet and 10 other zero-shot datasets, demonstrating that it enhances the zero-shot adversarial robustness of the original CLIP by at least 48.9% against AutoAttack (AA), while largely maintaining performance on clean examples. Moreover, TAPT outperforms existing adversarial prompt tuning methods across various backbones, achieving an average robustness improvement of at least 36.6%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13136v1-abstract-full').style.display = 'none'; document.getElementById('2411.13136v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13081">arXiv:2411.13081</a> <span> [<a href="https://arxiv.org/pdf/2411.13081">pdf</a>, <a href="https://arxiv.org/format/2411.13081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Practical Compact Deep Compressed Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13081v1-abstract-short" style="display: inline;"> Recent years have witnessed the success of deep networks in compressed sensing (CS), which allows for a significant reduction in sampling cost and has gained growing attention since its inception. In this paper, we propose a new practical and compact network dubbed PCNet for general image CS. Specifically, in PCNet, a novel collaborative sampling operator is designed, which consists of a deep cond… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13081v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13081v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13081v1-abstract-full" style="display: none;"> Recent years have witnessed the success of deep networks in compressed sensing (CS), which allows for a significant reduction in sampling cost and has gained growing attention since its inception. In this paper, we propose a new practical and compact network dubbed PCNet for general image CS. Specifically, in PCNet, a novel collaborative sampling operator is designed, which consists of a deep conditional filtering step and a dual-branch fast sampling step. The former learns an implicit representation of a linear transformation matrix into a few convolutions and first performs adaptive local filtering on the input image, while the latter then uses a discrete cosine transform and a scrambled block-diagonal Gaussian matrix to generate under-sampled measurements. Our PCNet is equipped with an enhanced proximal gradient descent algorithm-unrolled network for reconstruction. It offers flexibility, interpretability, and strong recovery performance for arbitrary sampling rates once trained. Additionally, we provide a deployment-oriented extraction scheme for single-pixel CS imaging systems, which allows for the convenient conversion of any linear sampling operator to its matrix form to be loaded onto hardware like digital micro-mirror devices. Extensive experiments on natural image CS, quantized CS, and self-supervised CS demonstrate the superior reconstruction accuracy and generalization ability of PCNet compared to existing state-of-the-art methods, particularly for high-resolution images. Code is available at https://github.com/Guaishou74851/PCNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13081v1-abstract-full').style.display = 'none'; document.getElementById('2411.13081v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE T-PAMI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12196">arXiv:2411.12196</a> <span> [<a href="https://arxiv.org/pdf/2411.12196">pdf</a>, <a href="https://arxiv.org/format/2411.12196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A More Advanced Group Polarization Measurement Approach Based on LLM-Based Agents and Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zixin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yiran Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12196v1-abstract-short" style="display: inline;"> Group polarization is an important research direction in social media content analysis, attracting many researchers to explore this field. Therefore, how to effectively measure group polarization has become a critical topic. Measuring group polarization on social media presents several challenges that have not yet been addressed by existing solutions. First, social media group polarization measure… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12196v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12196v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12196v1-abstract-full" style="display: none;"> Group polarization is an important research direction in social media content analysis, attracting many researchers to explore this field. Therefore, how to effectively measure group polarization has become a critical topic. Measuring group polarization on social media presents several challenges that have not yet been addressed by existing solutions. First, social media group polarization measurement involves processing vast amounts of text, which poses a significant challenge for information extraction. Second, social media texts often contain hard-to-understand content, including sarcasm, memes, and internet slang. Additionally, group polarization research focuses on holistic analysis, while texts is typically fragmented. To address these challenges, we designed a solution based on a multi-agent system and used a graph-structured Community Sentiment Network (CSN) to represent polarization states. Furthermore, we developed a metric called Community Opposition Index (COI) based on the CSN to quantify polarization. Finally, we tested our multi-agent system through a zero-shot stance detection task and achieved outstanding results. In summary, the proposed approach has significant value in terms of usability, accuracy, and interpretability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12196v1-abstract-full').style.display = 'none'; document.getElementById('2411.12196v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12183">arXiv:2411.12183</a> <span> [<a href="https://arxiv.org/pdf/2411.12183">pdf</a>, <a href="https://arxiv.org/format/2411.12183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of Beamlines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyu Wang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+S">Shengran Dai</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jianhui Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shuang Wu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yufei Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junbin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12183v1-abstract-short" style="display: inline;"> Synchrotron radiation sources play a crucial role in fields such as materials science, biology, and chemistry. The beamline, a key subsystem of the synchrotron, modulates and directs the radiation to the sample for analysis. However, the alignment of beamlines is a complex and time-consuming process, primarily carried out manually by experienced engineers. Even minor misalignments in optical compo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12183v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12183v1-abstract-full" style="display: none;"> Synchrotron radiation sources play a crucial role in fields such as materials science, biology, and chemistry. The beamline, a key subsystem of the synchrotron, modulates and directs the radiation to the sample for analysis. However, the alignment of beamlines is a complex and time-consuming process, primarily carried out manually by experienced engineers. Even minor misalignments in optical components can significantly affect the beam's properties, leading to suboptimal experimental outcomes. Current automated methods, such as bayesian optimization (BO) and reinforcement learning (RL), although these methods enhance performance, limitations remain. The relationship between the current and target beam properties, crucial for determining the adjustment, is not fully considered. Additionally, the physical characteristics of optical elements are overlooked, such as the need to adjust specific devices to control the output beam's spot size or position. This paper addresses the alignment of beamlines by modeling it as a Markov Decision Process (MDP) and training an intelligent agent using RL. The agent calculates adjustment values based on the current and target beam states, executes actions, and iterates until optimal parameters are achieved. A policy network with action attention is designed to improve decision-making by considering both state differences and the impact of optical components. Experiments on two simulated beamlines demonstrate that our algorithm outperforms existing methods, with ablation studies highlighting the effectiveness of the action attention-based policy network. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12183v1-abstract-full').style.display = 'none'; document.getElementById('2411.12183v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12181">arXiv:2411.12181</a> <span> [<a href="https://arxiv.org/pdf/2411.12181">pdf</a>, <a href="https://arxiv.org/format/2411.12181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low Dose Computed Tomography Images Using Consistency Training Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gokmen%2C+M+S">Mahmut S. Gokmen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jin Chen</a>, <a href="/search/cs?searchtype=author&query=Bumgardner%2C+C">Cody Bumgardner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12181v1-abstract-short" style="display: inline;"> Diffusion models have significant impact on wide range of generative tasks, especially on image inpainting and restoration. Although the improvements on aiming for decreasing number of function evaluations (NFE), the iterative results are still computationally expensive. Consistency models are as a new family of generative models, enable single-step sampling of high quality data without the need f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12181v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12181v1-abstract-full" style="display: none;"> Diffusion models have significant impact on wide range of generative tasks, especially on image inpainting and restoration. Although the improvements on aiming for decreasing number of function evaluations (NFE), the iterative results are still computationally expensive. Consistency models are as a new family of generative models, enable single-step sampling of high quality data without the need for adversarial training. In this paper, we introduce the beta noise distribution, which provides flexibility in adjusting noise levels. This is combined with a sinusoidal curriculum that enhances the learning of the trajectory between the noise distribution and the posterior distribution of interest, allowing High Noise Improved Consistency Training (HN-iCT) to be trained in a supervised fashion. Additionally, High Noise Improved Consistency Training with Image Condition (HN-iCT-CN) architecture is introduced, enables to take Low Dose images as a condition for extracting significant features by Weighted Attention Gates (WAG).Our results indicate that unconditional image generation using HN-iCT significantly outperforms basic CT and iCT training techniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our image-conditioned model demonstrates exceptional performance in enhancing low-dose (LD) CT scans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12181v1-abstract-full').style.display = 'none'; document.getElementById('2411.12181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11909">arXiv:2411.11909</a> <span> [<a href="https://arxiv.org/pdf/2411.11909">pdf</a>, <a href="https://arxiv.org/format/2411.11909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SymDPO: Boosting In-Context Learning of Large Multimodal Models with Symbol Demonstration Direct Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+H">Hongrui Jia</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chaoya Jiang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wei Ye</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Mengfan Dong</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shikun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11909v2-abstract-short" style="display: inline;"> As language models continue to scale, Large Language Models (LLMs) have exhibited emerging capabilities in In-Context Learning (ICL), enabling them to solve language tasks by prefixing a few in-context demonstrations (ICDs) as context. Inspired by these advancements, researchers have extended these techniques to develop Large Multimodal Models (LMMs) with ICL capabilities. However, existing LMMs f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11909v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11909v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11909v2-abstract-full" style="display: none;"> As language models continue to scale, Large Language Models (LLMs) have exhibited emerging capabilities in In-Context Learning (ICL), enabling them to solve language tasks by prefixing a few in-context demonstrations (ICDs) as context. Inspired by these advancements, researchers have extended these techniques to develop Large Multimodal Models (LMMs) with ICL capabilities. However, existing LMMs face a critical issue: they often fail to effectively leverage the visual context in multimodal demonstrations and instead simply follow textual patterns. This indicates that LMMs do not achieve effective alignment between multimodal demonstrations and model outputs. To address this problem, we propose Symbol Demonstration Direct Preference Optimization (SymDPO). Specifically, SymDPO aims to break the traditional paradigm of constructing multimodal demonstrations by using random symbols to replace text answers within instances. This forces the model to carefully understand the demonstration images and establish a relationship between the images and the symbols to answer questions correctly. We validate the effectiveness of this method on multiple benchmarks, demonstrating that with SymDPO, LMMs can more effectively understand the multimodal context within examples and utilize this knowledge to answer questions better. Code is available at https://github.com/APiaoG/SymDPO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11909v2-abstract-full').style.display = 'none'; document.getElementById('2411.11909v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11762">arXiv:2411.11762</a> <span> [<a href="https://arxiv.org/pdf/2411.11762">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous Electric Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiyue Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Masoud%2C+N">Neda Masoud</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuhong Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Heye Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11762v2-abstract-short" style="display: inline;"> Executing drift maneuvers during high-speed cornering presents significant challenges for autonomous vehicles, yet offers the potential to minimize turning time and enhance driving dynamics. While reinforcement learning (RL) has shown promising results in simulated environments, discrepancies between simulations and real-world conditions have limited its practical deployment. This study introduces… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11762v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11762v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11762v2-abstract-full" style="display: none;"> Executing drift maneuvers during high-speed cornering presents significant challenges for autonomous vehicles, yet offers the potential to minimize turning time and enhance driving dynamics. While reinforcement learning (RL) has shown promising results in simulated environments, discrepancies between simulations and real-world conditions have limited its practical deployment. This study introduces an innovative control framework that integrates trajectory optimization with drift maneuvers, aiming to improve the algorithm's adaptability for real-vehicle implementation. We leveraged Bezier-based pre-trajectory optimization to enhance rewards and optimize the controller through Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated environment. For real-world deployment, we implement a hybrid RL-MPC fusion mechanism, , where TD3-derived maneuvers serve as primary inputs for a Model Predictive Controller (MPC). This integration enables precise real-time tracking of the optimal trajectory, with MPC providing corrective inputs to bridge the gap between simulation and reality. The efficacy of this method is validated through real-vehicle tests on consumer-grade electric vehicles, focusing on drift U-turns and drift right-angle turns. The control outcomes of these real-vehicle tests are thoroughly documented in the paper, supported by supplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this study is the first to deploy and apply an RL-based transient drift cornering algorithm on consumer-grade electric vehicles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11762v2-abstract-full').style.display = 'none'; document.getElementById('2411.11762v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In the process of being submitted to the Journal of IEEE Transactions on Industrial Electronics</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository