Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 605 results for author: <span class="mathjax">Su, H</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Su%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Su, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Su%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Su, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Su%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14499">arXiv:2411.14499</a> <span> [<a href="https://arxiv.org/pdf/2411.14499">pdf</a>, <a href="https://arxiv.org/format/2411.14499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Understanding World or Predicting Future? A Comprehensive Survey of World Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jingtao Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yunke Zhang</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+Y">Yu Shang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zong%2C+Z">Zefang Zong</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jie Feng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongyuan Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Nian Li</a>, <a href="/search/cs?searchtype=author&query=Sukiennik%2C+N">Nicholas Sukiennik</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengli Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14499v1-abstract-short" style="display: inline;"> The concept of world models has garnered significant attention due to advancements in multimodal large language models such as GPT-4 and video generation models such as Sora, which are central to the pursuit of artificial general intelligence. This survey offers a comprehensive review of the literature on world models. Generally, world models are regarded as tools for either understanding the pres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14499v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14499v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14499v1-abstract-full" style="display: none;"> The concept of world models has garnered significant attention due to advancements in multimodal large language models such as GPT-4 and video generation models such as Sora, which are central to the pursuit of artificial general intelligence. This survey offers a comprehensive review of the literature on world models. Generally, world models are regarded as tools for either understanding the present state of the world or predicting its future dynamics. This review presents a systematic categorization of world models, emphasizing two primary functions: (1) constructing internal representations to understand the mechanisms of the world, and (2) predicting future states to simulate and guide decision-making. Initially, we examine the current progress in these two categories. We then explore the application of world models in key domains, including autonomous driving, robotics, and social simulacra, with a focus on how each domain utilizes these aspects. Finally, we outline key challenges and provide insights into potential future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14499v1-abstract-full').style.display = 'none'; document.getElementById('2411.14499v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13152">arXiv:2411.13152</a> <span> [<a href="https://arxiv.org/pdf/2411.13152">pdf</a>, <a href="https://arxiv.org/format/2411.13152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AGLP: A Graph Learning Perspective for Semi-supervised Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Houcheng Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengzhu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiao Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+N">Nan Yin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liang Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13152v2-abstract-short" style="display: inline;"> In semi-supervised domain adaptation (SSDA), the model aims to leverage partially labeled target domain data along with a large amount of labeled source domain data to enhance its generalization capability for the target domain. A key advantage of SSDA is its ability to significantly reduce reliance on labeled data, thereby lowering the costs and time associated with data preparation. Most existin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13152v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13152v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13152v2-abstract-full" style="display: none;"> In semi-supervised domain adaptation (SSDA), the model aims to leverage partially labeled target domain data along with a large amount of labeled source domain data to enhance its generalization capability for the target domain. A key advantage of SSDA is its ability to significantly reduce reliance on labeled data, thereby lowering the costs and time associated with data preparation. Most existing SSDA methods utilize information from domain labels and class labels but overlook the structural information of the data. To address this issue, this paper proposes a graph learning perspective (AGLP) for semi-supervised domain adaptation. We apply the graph convolutional network to the instance graph which allows structural information to propagate along the weighted graph edges. The proposed AGLP model has several advantages. First, to the best of our knowledge, this is the first work to model structural information in SSDA. Second, the proposed model can effectively learn domain-invariant and semantic representations, reducing domain discrepancies in SSDA. Extensive experimental results on multiple standard benchmarks demonstrate that the proposed AGLP algorithm outperforms state-of-the-art semi-supervised domain adaptation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13152v2-abstract-full').style.display = 'none'; document.getElementById('2411.13152v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8page</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 92C55; 62H35 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.4.10; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13147">arXiv:2411.13147</a> <span> [<a href="https://arxiv.org/pdf/2411.13147">pdf</a>, <a href="https://arxiv.org/format/2411.13147">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GraphCL: Graph-based Clustering for Semi-Supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengzhu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiao Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Houcheng Su</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+N">Nan Yin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liang Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13147v2-abstract-short" style="display: inline;"> Semi-supervised learning (SSL) has made notable advancements in medical image segmentation (MIS), particularly in scenarios with limited labeled data and significantly enhancing data utilization efficiency. Previous methods primarily focus on complex training strategies to utilize unlabeled data but neglect the importance of graph structural information. Different from existing methods, we propose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13147v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13147v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13147v2-abstract-full" style="display: none;"> Semi-supervised learning (SSL) has made notable advancements in medical image segmentation (MIS), particularly in scenarios with limited labeled data and significantly enhancing data utilization efficiency. Previous methods primarily focus on complex training strategies to utilize unlabeled data but neglect the importance of graph structural information. Different from existing methods, we propose a graph-based clustering for semi-supervised medical image segmentation (GraphCL) by jointly modeling graph data structure in a unified deep model. The proposed GraphCL model enjoys several advantages. Firstly, to the best of our knowledge, this is the first work to model the data structure information for semi-supervised medical image segmentation (SSMIS). Secondly, to get the clustered features across different graphs, we integrate both pairwise affinities between local image features and raw features as inputs. Extensive experimental results on three standard benchmarks show that the proposed GraphCL algorithm outperforms state-of-the-art semi-supervised medical image segmentation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13147v2-abstract-full').style.display = 'none'; document.getElementById('2411.13147v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9page</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 92C55; 62H35 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.4.10; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12503">arXiv:2411.12503</a> <span> [<a href="https://arxiv.org/pdf/2411.12503">pdf</a>, <a href="https://arxiv.org/format/2411.12503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ManiSkill-ViTac 2025: Challenge on Manipulation Skill Learning With Vision and Tactile Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuanyu Li</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+R">Renjun Dang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhiyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&query=Kasaei%2C+H">Hamidreza Kasaei</a>, <a href="/search/cs?searchtype=author&query=Calandra%2C+R">Roberto Calandra</a>, <a href="/search/cs?searchtype=author&query=Lepora%2C+N">Nathan Lepora</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shan Luo</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12503v1-abstract-short" style="display: inline;"> This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on learning contact-rich manipulation skills using both tactile and visual sensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3 independent tracks: tactile manipulation, tactile-vision fusion manipulation, and tactile sensor structure design. The challenge aims to push the boundaries of robotic manipul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12503v1-abstract-full" style="display: none;"> This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on learning contact-rich manipulation skills using both tactile and visual sensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3 independent tracks: tactile manipulation, tactile-vision fusion manipulation, and tactile sensor structure design. The challenge aims to push the boundaries of robotic manipulation skills, emphasizing the integration of tactile and visual data to enhance performance in complex, real-world tasks. Participants will be evaluated using standardized metrics across both simulated and real-world environments, spurring innovations in sensor design and significantly advancing the field of vision-tactile fusion in robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12503v1-abstract-full').style.display = 'none'; document.getElementById('2411.12503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Challenge webpage: https://ai-workshops.github.io/maniskill-vitac-challenge-2025/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12350">arXiv:2411.12350</a> <span> [<a href="https://arxiv.org/pdf/2411.12350">pdf</a>, <a href="https://arxiv.org/format/2411.12350">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DiM: $f$-Divergence Minimization Guided Sharpness-Aware Optimization for Semi-supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingli Wang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Houcheng Su</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+N">Nan Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengzhu Wang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12350v1-abstract-short" style="display: inline;"> As a technique to alleviate the pressure of data annotation, semi-supervised learning (SSL) has attracted widespread attention. In the specific domain of medical image segmentation, semi-supervised methods (SSMIS) have become a research hotspot due to their ability to reduce the need for large amounts of precisely annotated data. SSMIS focuses on enhancing the model's generalization performance by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12350v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12350v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12350v1-abstract-full" style="display: none;"> As a technique to alleviate the pressure of data annotation, semi-supervised learning (SSL) has attracted widespread attention. In the specific domain of medical image segmentation, semi-supervised methods (SSMIS) have become a research hotspot due to their ability to reduce the need for large amounts of precisely annotated data. SSMIS focuses on enhancing the model's generalization performance by leveraging a small number of labeled samples and a large number of unlabeled samples. The latest sharpness-aware optimization (SAM) technique, which optimizes the model by reducing the sharpness of the loss function, has shown significant success in SSMIS. However, SAM and its variants may not fully account for the distribution differences between different datasets. To address this issue, we propose a sharpness-aware optimization method based on $f$-divergence minimization (DiM) for semi-supervised medical image segmentation. This method enhances the model's stability by fine-tuning the sensitivity of model parameters and improves the model's adaptability to different datasets through the introduction of $f$-divergence. By reducing $f$-divergence, the DiM method not only improves the performance balance between the source and target datasets but also prevents performance degradation due to overfitting on the source dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12350v1-abstract-full').style.display = 'none'; document.getElementById('2411.12350v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8page</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 92C55; 62H35 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.4.10; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10498">arXiv:2411.10498</a> <span> [<a href="https://arxiv.org/pdf/2411.10498">pdf</a>, <a href="https://arxiv.org/format/2411.10498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prompt-Guided Environmentally Consistent Adversarial Patch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chaoqun Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Huanqian Yan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Lifeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tairan Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuodong Liu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10498v1-abstract-short" style="display: inline;"> Adversarial attacks in the physical world pose a significant threat to the security of vision-based systems, such as facial recognition and autonomous driving. Existing adversarial patch methods primarily focus on improving attack performance, but they often produce patches that are easily detectable by humans and struggle to achieve environmental consistency, i.e., blending patches into the envir… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10498v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10498v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10498v1-abstract-full" style="display: none;"> Adversarial attacks in the physical world pose a significant threat to the security of vision-based systems, such as facial recognition and autonomous driving. Existing adversarial patch methods primarily focus on improving attack performance, but they often produce patches that are easily detectable by humans and struggle to achieve environmental consistency, i.e., blending patches into the environment. This paper introduces a novel approach for generating adversarial patches, which addresses both the visual naturalness and environmental consistency of the patches. We propose Prompt-Guided Environmentally Consistent Adversarial Patch (PG-ECAP), a method that aligns the patch with the environment to ensure seamless integration into the environment. The approach leverages diffusion models to generate patches that are both environmental consistency and effective in evading detection. To further enhance the naturalness and consistency, we introduce two alignment losses: Prompt Alignment Loss and Latent Space Alignment Loss, ensuring that the generated patch maintains its adversarial properties while fitting naturally within its environment. Extensive experiments in both digital and physical domains demonstrate that PG-ECAP outperforms existing methods in attack success rate and environmental consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10498v1-abstract-full').style.display = 'none'; document.getElementById('2411.10498v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10003">arXiv:2411.10003</a> <span> [<a href="https://arxiv.org/pdf/2411.10003">pdf</a>, <a href="https://arxiv.org/format/2411.10003">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Pro-Prophet: A Systematic Load Balancing Method for Efficient Parallel Training of Large-scale MoE Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+Z">Zhiquan Lai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shengwei Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weijie Liu</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+K">Keshi Ge</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+A">Ao Shen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Huayou Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongsheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10003v2-abstract-short" style="display: inline;"> The size of deep learning models has been increasing to enhance model quality. The linear increase in training computation budget with model size means that training an extremely large-scale model is exceedingly time-consuming. Recently, the Mixture of Expert (MoE) has drawn significant attention as it can scale models to extra-large sizes with a stable computation budget. However, inefficient dis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10003v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10003v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10003v2-abstract-full" style="display: none;"> The size of deep learning models has been increasing to enhance model quality. The linear increase in training computation budget with model size means that training an extremely large-scale model is exceedingly time-consuming. Recently, the Mixture of Expert (MoE) has drawn significant attention as it can scale models to extra-large sizes with a stable computation budget. However, inefficient distributed training of large-scale MoE models hinders their broader application. Specifically, a considerable dynamic load imbalance occurs among devices during training, significantly reducing throughput. Several load-balancing works have been proposed to address the challenge. System-level solutions draw more attention for their hardware affinity and non-disruption of model convergence compared to algorithm-level ones. However, they are troubled by high communication costs and poor communication-computation overlapping. To address these challenges, we propose a systematic load-balancing method, Pro-Prophet, which consists of a planner and a scheduler for efficient parallel training of large-scale MoE models. To adapt to the dynamic load imbalance, we profile training statistics and use them to design Pro-Prophet. For lower communication volume, Pro-Prophet planner determines a series of lightweight load-balancing strategies and efficiently searches for a communication-efficient one for training based on the statistics. For sufficient overlapping of communication and computation, Pro-Prophet scheduler schedules the data-dependent operations based on the statistics and operation features, further improving the training throughput. Experimental results indicate that Pro-Prophet achieves up to 2.66x speedup compared to Deepspeed-MoE and FasterMoE. Additionally, Pro-Prophet achieves a load-balancing enhancement of up to 11.01x when compared to FasterMoE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10003v2-abstract-full').style.display = 'none'; document.getElementById('2411.10003v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09595">arXiv:2411.09595</a> <span> [<a href="https://arxiv.org/pdf/2411.09595">pdf</a>, <a href="https://arxiv.org/format/2411.09595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengyi Wang</a>, <a href="/search/cs?searchtype=author&query=Lorraine%2C+J">Jonathan Lorraine</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yikai Wang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Fidler%2C+S">Sanja Fidler</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xiaohui Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09595v1-abstract-short" style="display: inline;"> This work explores expanding the capabilities of large language models (LLMs) pretrained on text to generate 3D meshes within a unified model. This offers key advantages of (1) leveraging spatial knowledge already embedded in LLMs, derived from textual sources like 3D tutorials, and (2) enabling conversational 3D generation and mesh understanding. A primary challenge is effectively tokenizing 3D m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09595v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09595v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09595v1-abstract-full" style="display: none;"> This work explores expanding the capabilities of large language models (LLMs) pretrained on text to generate 3D meshes within a unified model. This offers key advantages of (1) leveraging spatial knowledge already embedded in LLMs, derived from textual sources like 3D tutorials, and (2) enabling conversational 3D generation and mesh understanding. A primary challenge is effectively tokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly. To address this, we introduce LLaMA-Mesh, a novel approach that represents the vertex coordinates and face definitions of 3D meshes as plain text, allowing direct integration with LLMs without expanding the vocabulary. We construct a supervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate 3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs as required, and (3) understand and interpret 3D meshes. Our work is the first to demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge for 3D mesh generation in a text-based format, effectively unifying the 3D and text modalities. LLaMA-Mesh achieves mesh generation quality on par with models trained from scratch while maintaining strong text generation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09595v1-abstract-full').style.display = 'none'; document.getElementById('2411.09595v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">See the project website at https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T05 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.3.5; I.2.10; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07763">arXiv:2411.07763</a> <span> [<a href="https://arxiv.org/pdf/2411.07763">pdf</a>, <a href="https://arxiv.org/format/2411.07763">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+F">Fangyu Lei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yuxiao Ye</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+R">Ruisheng Cao</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+D">Dongchan Shin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongjin Su</a>, <a href="/search/cs?searchtype=author&query=Suo%2C+Z">Zhaoqing Suo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Hongcheng Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wenjing Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+P">Pengcheng Yin</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+V">Victor Zhong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoxi Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qian Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sida Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07763v1-abstract-short" style="display: inline;"> Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases. The databases in Spide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07763v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07763v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07763v1-abstract-full" style="display: none;"> Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases. The databases in Spider 2.0 are sourced from real data applications, often containing over 1,000 columns and stored in local or cloud database systems such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 frequently requires understanding and searching through database metadata, dialect documentation, and even project-level codebases. This challenge calls for models to interact with complex SQL workflow environments, process extremely long contexts, perform intricate reasoning, and generate multiple SQL queries with diverse operations, often exceeding 100 lines, which goes far beyond traditional text-to-SQL challenges. Our evaluations indicate that based on o1-preview, our code agent framework successfully solves only 17.0% of the tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on Spider 2.0 show that while language models have demonstrated remarkable performance in code generation -- especially in prior text-to-SQL benchmarks -- they require significant improvement in order to achieve adequate performance for real-world enterprise usage. Progress on Spider 2.0 represents crucial steps towards developing intelligent, autonomous, code agents for real-world enterprise settings. Our code, baseline models, and data are available at https://spider2-sql.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07763v1-abstract-full').style.display = 'none'; document.getElementById('2411.07763v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06272">arXiv:2411.06272</a> <span> [<a href="https://arxiv.org/pdf/2411.06272">pdf</a>, <a href="https://arxiv.org/format/2411.06272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Golden Touchstone: A Comprehensive Bilingual Benchmark for Evaluating Financial Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaojun Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junxi Liu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Huanyi Su</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhouchi Lin</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yiyan Qi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengjin Xu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jiajun Su</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+J">Jiajie Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuwei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Saizhuo Wang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+F">Fengrui Hua</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jian Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06272v1-abstract-short" style="display: inline;"> As large language models become increasingly prevalent in the financial sector, there is a pressing need for a standardized method to comprehensively assess their performance. However, existing finance benchmarks often suffer from limited language and task coverage, as well as challenges such as low-quality datasets and inadequate adaptability for LLM evaluation. To address these limitations, we p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06272v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06272v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06272v1-abstract-full" style="display: none;"> As large language models become increasingly prevalent in the financial sector, there is a pressing need for a standardized method to comprehensively assess their performance. However, existing finance benchmarks often suffer from limited language and task coverage, as well as challenges such as low-quality datasets and inadequate adaptability for LLM evaluation. To address these limitations, we propose "Golden Touchstone", the first comprehensive bilingual benchmark for financial LLMs, which incorporates representative datasets from both Chinese and English across eight core financial NLP tasks. Developed from extensive open source data collection and industry-specific demands, this benchmark includes a variety of financial tasks aimed at thoroughly assessing models' language understanding and generation capabilities. Through comparative analysis of major models on the benchmark, such as GPT-4o Llama3, FinGPT and FinMA, we reveal their strengths and limitations in processing complex financial information. Additionally, we open-sourced Touchstone-GPT, a financial LLM trained through continual pre-training and financial instruction tuning, which demonstrates strong performance on the bilingual benchmark but still has limitations in specific tasks.This research not only provides the financial large language models with a practical evaluation tool but also guides the development and optimization of future research. The source code for Golden Touchstone and model weight of Touchstone-GPT have been made publicly available at \url{https://github.com/IDEA-FinAI/Golden-Touchstone}, contributing to the ongoing evolution of FinLLMs and fostering further research in this critical area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06272v1-abstract-full').style.display = 'none'; document.getElementById('2411.06272v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 9 tables, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03814">arXiv:2411.03814</a> <span> [<a href="https://arxiv.org/pdf/2411.03814">pdf</a>, <a href="https://arxiv.org/format/2411.03814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fengxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+R">Ranjie Duan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Peng Xiao</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">YueFeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chongwen Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jialing Tao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hui Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03814v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) demonstrate outstanding performance in their reservoir of knowledge and understanding capabilities, but they have also been shown to be prone to illegal or unethical reactions when subjected to jailbreak attacks. To ensure their responsible deployment in critical applications, it is crucial to understand the safety capabilities and vulnerabilities of LLMs. Previous wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03814v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03814v1-abstract-full" style="display: none;"> Large Language Models (LLMs) demonstrate outstanding performance in their reservoir of knowledge and understanding capabilities, but they have also been shown to be prone to illegal or unethical reactions when subjected to jailbreak attacks. To ensure their responsible deployment in critical applications, it is crucial to understand the safety capabilities and vulnerabilities of LLMs. Previous works mainly focus on jailbreak in single-round dialogue, overlooking the potential jailbreak risks in multi-round dialogues, which are a vital way humans interact with and extract information from LLMs. Some studies have increasingly concentrated on the risks associated with jailbreak in multi-round dialogues. These efforts typically involve the use of manually crafted templates or prompt engineering techniques. However, due to the inherent complexity of multi-round dialogues, their jailbreak performance is limited. To solve this problem, we propose a novel multi-round dialogue jailbreaking agent, emphasizing the importance of stealthiness in identifying and mitigating potential threats to human values posed by LLMs. We propose a risk decomposition strategy that distributes risks across multiple rounds of queries and utilizes psychological strategies to enhance attack strength. Extensive experiments show that our proposed method surpasses other attack methods and achieves state-of-the-art attack success rate. We will make the corresponding code and dataset available for future research. The code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03814v1-abstract-full').style.display = 'none'; document.getElementById('2411.03814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01850">arXiv:2411.01850</a> <span> [<a href="https://arxiv.org/pdf/2411.01850">pdf</a>, <a href="https://arxiv.org/format/2411.01850">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ManiBox: Enhancing Spatial Grasping Generalization via Scalable Simulation Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hengkai Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuezhou Xu</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+C">Chengyang Ying</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xinyi Mao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songming Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01850v1-abstract-short" style="display: inline;"> Learning a precise robotic grasping policy is crucial for embodied agents operating in complex real-world manipulation tasks. Despite significant advancements, most models still struggle with accurate spatial positioning of objects to be grasped. We first show that this spatial generalization challenge stems primarily from the extensive data requirements for adequate spatial understanding. However… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01850v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01850v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01850v1-abstract-full" style="display: none;"> Learning a precise robotic grasping policy is crucial for embodied agents operating in complex real-world manipulation tasks. Despite significant advancements, most models still struggle with accurate spatial positioning of objects to be grasped. We first show that this spatial generalization challenge stems primarily from the extensive data requirements for adequate spatial understanding. However, collecting such data with real robots is prohibitively expensive, and relying on simulation data often leads to visual generalization gaps upon deployment. To overcome these challenges, we then focus on state-based policy generalization and present \textbf{ManiBox}, a novel bounding-box-guided manipulation method built on a simulation-based teacher-student framework. The teacher policy efficiently generates scalable simulation data using bounding boxes, which are proven to uniquely determine the objects' spatial positions. The student policy then utilizes these low-dimensional spatial states to enable zero-shot transfer to real robots. Through comprehensive evaluations in simulated and real-world environments, ManiBox demonstrates a marked improvement in spatial grasping generalization and adaptability to diverse objects and backgrounds. Further, our empirical study into scaling laws for policy performance indicates that spatial volume generalization scales positively with data volume. For a certain level of spatial volume, the success rate of grasping empirically follows Michaelis-Menten kinetics relative to data volume, showing a saturation effect as data increases. Our videos and code are available in https://thkkk.github.io/manibox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01850v1-abstract-full').style.display = 'none'; document.getElementById('2411.01850v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23574">arXiv:2410.23574</a> <span> [<a href="https://arxiv.org/pdf/2410.23574">pdf</a>, <a href="https://arxiv.org/format/2410.23574">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Online Convex Optimization with Memory and Limited Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+L">Lintao Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengmiao Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhi-Wei Liu</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+M">Ming Chi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaoling Wang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Housheng Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23574v1-abstract-short" style="display: inline;"> We study the problem of online convex optimization with memory and predictions over a horizon $T$. At each time step, a decision maker is given some limited predictions of the cost functions from a finite window of future time steps, i.e., values of the cost function at certain decision points in the future. The decision maker then chooses an action and incurs a cost given by a convex function tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23574v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23574v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23574v1-abstract-full" style="display: none;"> We study the problem of online convex optimization with memory and predictions over a horizon $T$. At each time step, a decision maker is given some limited predictions of the cost functions from a finite window of future time steps, i.e., values of the cost function at certain decision points in the future. The decision maker then chooses an action and incurs a cost given by a convex function that depends on the actions chosen in the past. We propose an algorithm to solve this problem and show that the dynamic regret of the algorithm decays exponentially with the prediction window length. Our algorithm contains two general subroutines that work for wider classes of problems. The first subroutine can solve general online convex optimization with memory and bandit feedback with $\sqrt{T}$-dynamic regret with respect to $T$. The second subroutine is a zeroth-order method that can be used to solve general convex optimization problems with a linear convergence rate that matches the best achievable rate of first-order methods for convex optimization. The key to our algorithm design and analysis is the use of truncated Gaussian smoothing when querying the decision points for obtaining the predictions. We complement our theoretical results using numerical experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23574v1-abstract-full').style.display = 'none'; document.getElementById('2410.23574v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22643">arXiv:2410.22643</a> <span> [<a href="https://arxiv.org/pdf/2410.22643">pdf</a>, <a href="https://arxiv.org/format/2410.22643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> An Overtaking Trajectory Planning Framework Based on Spatio-temporal Topology and Reachable Set Analysis Ensuring Time Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+W">Wule Mao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhouheng Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongye Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22643v1-abstract-short" style="display: inline;"> Generating overtaking trajectories in high-speed scenarios presents significant challenges and is typically addressed through hierarchical planning methods. However, this method has two primary drawbacks. First, heuristic algorithms can only provide a single initial solution, which may lead to local optima and consequently diminish the quality of the solution. Second, the time efficiency of trajec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22643v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22643v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22643v1-abstract-full" style="display: none;"> Generating overtaking trajectories in high-speed scenarios presents significant challenges and is typically addressed through hierarchical planning methods. However, this method has two primary drawbacks. First, heuristic algorithms can only provide a single initial solution, which may lead to local optima and consequently diminish the quality of the solution. Second, the time efficiency of trajectory refinement based on numerical optimization is insufficient. To overcome these limitations, this paper proposes an overtaking trajectory planning framework based on spatio-temporal topology and reachable set analysis (SROP), to improve trajectory quality and time efficiency. Specifically, this paper introduces topological classes to describe trajectories representing different overtaking behaviors, which support the spatio-temporal topological search method employed by the upper-layer planner to identify diverse initial paths. This approach helps prevent getting stuck in local optima, enhancing the overall solution quality by considering multiple initial solutions from distinct topologies. Moreover, the reachable set method is integrated into the lower-layer planner for parallel trajectory evaluation. This method enhances planning efficiency by decoupling vehicle model constraints from the optimization process, enabling parallel computation while ensuring control feasibility. Simulation results show that the proposed method improves the smoothness of generated trajectories by 66.8% compared to state-of-the-art methods, highlighting its effectiveness in enhancing trajectory quality. Additionally, this method reduces computation time by 62.9%, demonstrating its efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22643v1-abstract-full').style.display = 'none'; document.getElementById('2410.22643v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21358">arXiv:2410.21358</a> <span> [<a href="https://arxiv.org/pdf/2410.21358">pdf</a>, <a href="https://arxiv.org/format/2410.21358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> "We do use it, but not how hearing people think": How the Deaf and Hard of Hearing Community Uses Large Language Model Tools </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huffman%2C+S">Shuxu Huffman</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Si Chen</a>, <a href="/search/cs?searchtype=author&query=Mack%2C+K+A">Kelly Avery Mack</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Haotian Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Kushalnagar%2C+R">Raja Kushalnagar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21358v2-abstract-short" style="display: inline;"> Generative AI tools, particularly those utilizing large language models (LLMs), have become increasingly prevalent in both professional and personal contexts, offering powerful capabilities for text generation and communication support. While these tools are widely used to enhance productivity and accessibility, there has been limited exploration of how Deaf and Hard of Hearing (DHH) individuals e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21358v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21358v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21358v2-abstract-full" style="display: none;"> Generative AI tools, particularly those utilizing large language models (LLMs), have become increasingly prevalent in both professional and personal contexts, offering powerful capabilities for text generation and communication support. While these tools are widely used to enhance productivity and accessibility, there has been limited exploration of how Deaf and Hard of Hearing (DHH) individuals engage with text-based generative AI tools, as well as the challenges they may encounter. This paper presents a mixed-method survey study investigating how the DHH community uses Text AI tools, such as ChatGPT, to reduce communication barriers, bridge Deaf and hearing cultures, and improve access to information. Through a survey of 80 DHH participants and separate interviews with 11 other participants, we found that while these tools provide significant benefits, including enhanced communication and mental health support, they also introduce barriers, such as a lack of American Sign Language (ASL) support and understanding of Deaf cultural nuances. Our findings highlight unique usage patterns within the DHH community and underscore the need for inclusive design improvements. We conclude by offering practical recommendations to enhance the accessibility of Text AI for the DHH community and suggest directions for future research in AI and accessibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21358v2-abstract-full').style.display = 'none'; document.getElementById('2410.21358v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18974">arXiv:2410.18974</a> <span> [<a href="https://arxiv.org/pdf/2410.18974">pdf</a>, <a href="https://arxiv.org/format/2410.18974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> 3D-Adapter: Geometry-Consistent Multi-View Diffusion for High-Quality 3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hansheng Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+B">Bokui Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yulin Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+R">Ruoxi Shi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Linqi Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C+Z">Connor Z. Lin</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiayuan Gu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&query=Wetzstein%2C+G">Gordon Wetzstein</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18974v1-abstract-short" style="display: inline;"> Multi-view image diffusion models have significantly advanced open-domain 3D object generation. However, most existing models rely on 2D network architectures that lack inherent 3D biases, resulting in compromised geometric consistency. To address this challenge, we introduce 3D-Adapter, a plug-in module designed to infuse 3D geometry awareness into pretrained image diffusion models. Central to ou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18974v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18974v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18974v1-abstract-full" style="display: none;"> Multi-view image diffusion models have significantly advanced open-domain 3D object generation. However, most existing models rely on 2D network architectures that lack inherent 3D biases, resulting in compromised geometric consistency. To address this challenge, we introduce 3D-Adapter, a plug-in module designed to infuse 3D geometry awareness into pretrained image diffusion models. Central to our approach is the idea of 3D feedback augmentation: for each denoising step in the sampling loop, 3D-Adapter decodes intermediate multi-view features into a coherent 3D representation, then re-encodes the rendered RGBD views to augment the pretrained base model through feature addition. We study two variants of 3D-Adapter: a fast feed-forward version based on Gaussian splatting and a versatile training-free version utilizing neural fields and meshes. Our extensive experiments demonstrate that 3D-Adapter not only greatly enhances the geometry quality of text-to-multi-view models such as Instant3D and Zero123++, but also enables high-quality 3D generation using the plain text-to-image Stable Diffusion. Furthermore, we showcase the broad application potential of 3D-Adapter by presenting high quality results in text-to-3D, image-to-3D, text-to-texture, and text-to-avatar tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18974v1-abstract-full').style.display = 'none'; document.getElementById('2410.18974v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://lakonik.github.io/3d-adapter/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14081">arXiv:2410.14081</a> <span> [<a href="https://arxiv.org/pdf/2410.14081">pdf</a>, <a href="https://arxiv.org/format/2410.14081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reward-free World Models for Online Imitation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shangzhe Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhiao Huang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14081v1-abstract-short" style="display: inline;"> Imitation learning (IL) enables agents to acquire skills directly from expert demonstrations, providing a compelling alternative to reinforcement learning. However, prior online IL approaches struggle with complex tasks characterized by high-dimensional inputs and complex dynamics. In this work, we propose a novel approach to online imitation learning that leverages reward-free world models. Our m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14081v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14081v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14081v1-abstract-full" style="display: none;"> Imitation learning (IL) enables agents to acquire skills directly from expert demonstrations, providing a compelling alternative to reinforcement learning. However, prior online IL approaches struggle with complex tasks characterized by high-dimensional inputs and complex dynamics. In this work, we propose a novel approach to online imitation learning that leverages reward-free world models. Our method learns environmental dynamics entirely in latent spaces without reconstruction, enabling efficient and accurate modeling. We adopt the inverse soft-Q learning objective, reformulating the optimization process in the Q-policy space to mitigate the instability associated with traditional optimization in the reward-policy space. By employing a learned latent dynamics model and planning for control, our approach consistently achieves stable, expert-level performance in tasks with high-dimensional observation or action spaces and intricate dynamics. We evaluate our method on a diverse set of benchmarks, including DMControl, MyoSuite, and ManiSkill2, demonstrating superior empirical performance compared to existing approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14081v1-abstract-full').style.display = 'none'; document.getElementById('2410.14081v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13116">arXiv:2410.13116</a> <span> [<a href="https://arxiv.org/pdf/2410.13116">pdf</a>, <a href="https://arxiv.org/format/2410.13116">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning to Summarize from LLM-generated Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+H">Hwanjun Song</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+T">Taewon Yun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yuho Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gihun Lee</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jason Cai</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13116v1-abstract-short" style="display: inline;"> Developing effective text summarizers remains a challenge due to issues like hallucinations, key information omissions, and verbosity in LLM-generated summaries. This work explores using LLM-generated feedback to improve summary quality by aligning the summaries with human preferences for faithfulness, completeness, and conciseness. We introduce FeedSum, a large-scale dataset containing multi-dime… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13116v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13116v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13116v1-abstract-full" style="display: none;"> Developing effective text summarizers remains a challenge due to issues like hallucinations, key information omissions, and verbosity in LLM-generated summaries. This work explores using LLM-generated feedback to improve summary quality by aligning the summaries with human preferences for faithfulness, completeness, and conciseness. We introduce FeedSum, a large-scale dataset containing multi-dimensional LLM feedback on summaries of varying quality across diverse domains. Our experiments show how feedback quality, dimensionality, and granularity influence preference learning, revealing that high-quality, multi-dimensional, fine-grained feedback significantly improves summary generation. We also compare two methods for using this feedback: supervised fine-tuning and direct preference optimization. Finally, we introduce SummLlama3-8b, a model that outperforms the nearly 10x larger Llama3-70b-instruct in generating human-preferred summaries, demonstrating that smaller models can achieve superior performance with appropriate training. The full dataset will be released soon. The SummLlama3-8B model is now available at https://huggingface.co/DISLab/SummLlama3-8B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13116v1-abstract-full').style.display = 'none'; document.getElementById('2410.13116v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12074">arXiv:2410.12074</a> <span> [<a href="https://arxiv.org/pdf/2410.12074">pdf</a>, <a href="https://arxiv.org/format/2410.12074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> nvTorchCam: An Open-source Library for Camera-Agnostic Differentiable Geometric Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lichy%2C+D">Daniel Lichy</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Badki%2C+A">Abhishek Badki</a>, <a href="/search/cs?searchtype=author&query=Kautz%2C+J">Jan Kautz</a>, <a href="/search/cs?searchtype=author&query=Gallo%2C+O">Orazio Gallo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12074v1-abstract-short" style="display: inline;"> We introduce nvTorchCam, an open-source library under the Apache 2.0 license, designed to make deep learning algorithms camera model-independent. nvTorchCam abstracts critical camera operations such as projection and unprojection, allowing developers to implement algorithms once and apply them across diverse camera models--including pinhole, fisheye, and 360 equirectangular panoramas, which are co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12074v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12074v1-abstract-full" style="display: none;"> We introduce nvTorchCam, an open-source library under the Apache 2.0 license, designed to make deep learning algorithms camera model-independent. nvTorchCam abstracts critical camera operations such as projection and unprojection, allowing developers to implement algorithms once and apply them across diverse camera models--including pinhole, fisheye, and 360 equirectangular panoramas, which are commonly used in automotive and real estate capture applications. Built on PyTorch, nvTorchCam is fully differentiable and supports GPU acceleration and batching for efficient computation. Furthermore, deep learning models trained for one camera type can be directly transferred to other camera types without requiring additional modification. In this paper, we provide an overview of nvTorchCam, its functionality, and present various code examples and diagrams to demonstrate its usage. Source code and installation instructions can be found on the nvTorchCam GitHub page at https://github.com/NVlabs/nvTorchCam. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12074v1-abstract-full').style.display = 'none'; document.getElementById('2410.12074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Source code and installation instructions are available at https://github.com/NVlabs/nvTorchCam</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11570">arXiv:2410.11570</a> <span> [<a href="https://arxiv.org/pdf/2410.11570">pdf</a>, <a href="https://arxiv.org/format/2410.11570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Data-Driven Aggressive Autonomous Racing Framework Utilizing Local Trajectory Planning with Velocity Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhouheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bei Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Cheng Hu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongye Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11570v1-abstract-short" style="display: inline;"> The development of autonomous driving has boosted the research on autonomous racing. However, existing local trajectory planning methods have difficulty planning trajectories with optimal velocity profiles at racetracks with sharp corners, thus weakening the performance of autonomous racing. To address this problem, we propose a local trajectory planning method that integrates Velocity Prediction… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11570v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11570v1-abstract-full" style="display: none;"> The development of autonomous driving has boosted the research on autonomous racing. However, existing local trajectory planning methods have difficulty planning trajectories with optimal velocity profiles at racetracks with sharp corners, thus weakening the performance of autonomous racing. To address this problem, we propose a local trajectory planning method that integrates Velocity Prediction based on Model Predictive Contour Control (VPMPCC). The optimal parameters of VPMPCC are learned through Bayesian Optimization (BO) based on a proposed novel Objective Function adapted to Racing (OFR). Specifically, VPMPCC achieves velocity prediction by encoding the racetrack as a reference velocity profile and incorporating it into the optimization problem. This method optimizes the velocity profile of local trajectories, especially at corners with significant curvature. The proposed OFR balances racing performance with vehicle safety, ensuring safe and efficient BO training. In the simulation, the number of training iterations for OFR-based BO is reduced by 42.86% compared to the state-of-the-art method. The optimal simulation-trained parameters are then applied to a real-world F1TENTH vehicle without retraining. During prolonged racing on a custom-built racetrack featuring significant sharp corners, the mean velocity of VPMPCC reaches 93.18% of the vehicle's handling limits. The released code is available at https://github.com/zhouhengli/VPMPCC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11570v1-abstract-full').style.display = 'none'; document.getElementById('2410.11570v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09403">arXiv:2410.09403</a> <span> [<a href="https://arxiv.org/pdf/2410.09403">pdf</a>, <a href="https://arxiv.org/format/2410.09403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Two Heads Are Better Than One: A Multi-Agent System Has the Potential to Improve Scientific Idea Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Haoyang Su</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Renqi Chen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xinzhe Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingzhe Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+N">Nanqing Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09403v1-abstract-short" style="display: inline;"> The rapid advancement of scientific progress requires innovative tools that can accelerate discovery. While recent AI methods, particularly large language models (LLMs), have shown promise in tasks such as hypothesis generation and experimental design, they fall short in replicating the collaborative nature of real-world scientific practices, where diverse teams of experts work together to tackle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09403v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09403v1-abstract-full" style="display: none;"> The rapid advancement of scientific progress requires innovative tools that can accelerate discovery. While recent AI methods, particularly large language models (LLMs), have shown promise in tasks such as hypothesis generation and experimental design, they fall short in replicating the collaborative nature of real-world scientific practices, where diverse teams of experts work together to tackle complex problems. To address the limitation, we propose an LLM-based multi-agent system, i.e., Virtual Scientists (VirSci), designed to mimic the teamwork inherent in scientific research. VirSci organizes a team of agents to collaboratively generate, evaluate, and refine research ideas. Through comprehensive experiments, we demonstrate that this multi-agent approach outperforms the state-of-the-art method in producing novel and impactful scientific ideas, showing potential in aligning with key insights in the Science of Science field. Our findings suggest that integrating collaborative agents can lead to more innovative scientific outputs, offering a robust system for autonomous scientific discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09403v1-abstract-full').style.display = 'none'; document.getElementById('2410.09403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09347">arXiv:2410.09347</a> <span> [<a href="https://arxiv.org/pdf/2410.09347">pdf</a>, <a href="https://arxiv.org/format/2410.09347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Toward Guidance-Free AR Visual Generation via Condition Contrastive Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huayu Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+P">Peize Sun</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09347v1-abstract-short" style="display: inline;"> Classifier-Free Guidance (CFG) is a critical technique for enhancing the sample quality of visual generative models. However, in autoregressive (AR) multi-modal generation, CFG introduces design inconsistencies between language and visual content, contradicting the design philosophy of unifying different modalities for visual AR. Motivated by language model alignment methods, we propose \textit{Co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09347v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09347v1-abstract-full" style="display: none;"> Classifier-Free Guidance (CFG) is a critical technique for enhancing the sample quality of visual generative models. However, in autoregressive (AR) multi-modal generation, CFG introduces design inconsistencies between language and visual content, contradicting the design philosophy of unifying different modalities for visual AR. Motivated by language model alignment methods, we propose \textit{Condition Contrastive Alignment} (CCA) to facilitate guidance-free AR visual generation with high performance and analyze its theoretical connection with guided sampling methods. Unlike guidance methods that alter the sampling process to achieve the ideal sampling distribution, CCA directly fine-tunes pretrained models to fit the same distribution target. Experimental results show that CCA can significantly enhance the guidance-free performance of all tested models with just one epoch of fine-tuning ($\sim$ 1\% of pretraining epochs) on the pretraining dataset, on par with guided sampling methods. This largely removes the need for guided sampling in AR visual generation and cuts the sampling cost by half. Moreover, by adjusting training parameters, CCA can achieve trade-offs between sample diversity and fidelity similar to CFG. This experimentally confirms the strong theoretical connection between language-targeted alignment and visual-targeted guidance methods, unifying two previously independent research fields. Code and model weights: https://github.com/thu-ml/CCA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09347v1-abstract-full').style.display = 'none'; document.getElementById('2410.09347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07864">arXiv:2410.07864</a> <span> [<a href="https://arxiv.org/pdf/2410.07864">pdf</a>, <a href="https://arxiv.org/format/2410.07864">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songming Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lingxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bangguo Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hengkai Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huayu Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengyi Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Ke Xu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07864v1-abstract-short" style="display: inline;"> Bimanual manipulation is essential in robotics, yet developing foundation models is extremely challenging due to the inherent complexity of coordinating two robot arms (leading to multi-modal action distributions) and the scarcity of training data. In this paper, we present the Robotics Diffusion Transformer (RDT), a pioneering diffusion foundation model for bimanual manipulation. RDT builds on di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07864v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07864v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07864v1-abstract-full" style="display: none;"> Bimanual manipulation is essential in robotics, yet developing foundation models is extremely challenging due to the inherent complexity of coordinating two robot arms (leading to multi-modal action distributions) and the scarcity of training data. In this paper, we present the Robotics Diffusion Transformer (RDT), a pioneering diffusion foundation model for bimanual manipulation. RDT builds on diffusion models to effectively represent multi-modality, with innovative designs of a scalable Transformer to deal with the heterogeneity of multi-modal inputs and to capture the nonlinearity and high frequency of robotic data. To address data scarcity, we further introduce a Physically Interpretable Unified Action Space, which can unify the action representations of various robots while preserving the physical meanings of original actions, facilitating learning transferrable physical knowledge. With these designs, we managed to pre-train RDT on the largest collection of multi-robot datasets to date and scaled it up to 1.2B parameters, which is the largest diffusion-based foundation model for robotic manipulation. We finally fine-tuned RDT on a self-created multi-task bimanual dataset with over 6K+ episodes to refine its manipulation capabilities. Experiments on real robots demonstrate that RDT significantly outperforms existing methods. It exhibits zero-shot generalization to unseen objects and scenes, understands and follows language instructions, learns new skills with just 1~5 demonstrations, and effectively handles complex, dexterous tasks. We refer to https://rdt-robotics.github.io/rdt-robotics/ for the code and videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07864v1-abstract-full').style.display = 'none'; document.getElementById('2410.07864v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06729">arXiv:2410.06729</a> <span> [<a href="https://arxiv.org/pdf/2410.06729">pdf</a>, <a href="https://arxiv.org/format/2410.06729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Perceptual Quality Assessment of Octree-RAHT Encoded 3D Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+D">Dongshuai Duan</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Honglei Su</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hui Yuan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+W">Wei Gao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiarun Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06729v2-abstract-short" style="display: inline;"> No-reference bitstream-layer point cloud quality assessment (PCQA) can be deployed without full decoding at any network node to achieve real-time quality monitoring. In this work, we focus on the PCQA problem dedicated to Octree-RAHT encoding mode. First, to address the issue that existing PCQA databases have a small scale and limited distortion levels, we establish the WPC5.0 database which is th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06729v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06729v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06729v2-abstract-full" style="display: none;"> No-reference bitstream-layer point cloud quality assessment (PCQA) can be deployed without full decoding at any network node to achieve real-time quality monitoring. In this work, we focus on the PCQA problem dedicated to Octree-RAHT encoding mode. First, to address the issue that existing PCQA databases have a small scale and limited distortion levels, we establish the WPC5.0 database which is the first one dedicated to Octree-RAHT encoding mode with a scale of 400 distorted point clouds (PCs) including 4 geometric multiplied by 5 attitude distortion levels. Then, we propose the first PCQA model dedicated to Octree-RAHT encoding mode by parsing PC bitstreams without full decoding. The model introduces texture bitrate (TBPP) to predict texture complexity (TC) and further derives the texture distortion factor. In addition, the Geometric Quantization Parameter (PQS) is used to estimate the geometric distortion factor, which is then integrated into the model along with the texture distortion factor to obtain the proposed PCQA model named streamPCQ-OR. The proposed model has been compared with other advanced PCQA methods on the WPC5.0, BASICS and M-PCCD databases, and experimental results show that our model has excellent performance while having very low computational complexity, providing a reliable choice for time-critical applications. To facilitate subsequent research, the database and source code will be publicly released at https://github.com/qdushl/Waterloo-Point-Cloud-Database-5.0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06729v2-abstract-full').style.display = 'none'; document.getElementById('2410.06729v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06689">arXiv:2410.06689</a> <span> [<a href="https://arxiv.org/pdf/2410.06689">pdf</a>, <a href="https://arxiv.org/format/2410.06689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Perceptual Quality Assessment of Trisoup-Lifting Encoded 3D Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+J">Juncheng Long</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Honglei Su</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hui Yuan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+W">Wei Gao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiarun Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06689v2-abstract-short" style="display: inline;"> No-reference bitstream-layer point cloud quality assessment (PCQA) can be deployed without full decoding at any network node to achieve real-time quality monitoring. In this work, we develop the first PCQA model dedicated to Trisoup-Lifting encoded 3D point clouds by analyzing bitstreams without full decoding. Specifically, we investigate the relationship among texture bitrate per point (TBPP), te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06689v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06689v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06689v2-abstract-full" style="display: none;"> No-reference bitstream-layer point cloud quality assessment (PCQA) can be deployed without full decoding at any network node to achieve real-time quality monitoring. In this work, we develop the first PCQA model dedicated to Trisoup-Lifting encoded 3D point clouds by analyzing bitstreams without full decoding. Specifically, we investigate the relationship among texture bitrate per point (TBPP), texture complexity (TC) and texture quantization parameter (TQP) while geometry encoding is lossless. Subsequently, we estimate TC by utilizing TQP and TBPP. Then, we establish a texture distortion evaluation model based on TC, TBPP and TQP. Ultimately, by integrating this texture distortion model with a geometry attenuation factor, a function of trisoupNodeSizeLog2 (tNSL), we acquire a comprehensive NR bitstream-layer PCQA model named streamPCQ-TL. In addition, this work establishes a database named WPC6.0, the first and largest PCQA database dedicated to Trisoup-Lifting encoding mode, encompassing 400 distorted point clouds with both 4 geometric multiplied by 5 texture distortion levels. Experiment results on M-PCCD, ICIP2020 and the proposed WPC6.0 database suggest that the proposed streamPCQ-TL model exhibits robust and notable performance in contrast to existing advanced PCQA metrics, particularly in terms of computational cost. The dataset and source code will be publicly released at https://github.com/qdushl/Waterloo-Point-Cloud-Database-6.0 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06689v2-abstract-full').style.display = 'none'; document.getElementById('2410.06689v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05740">arXiv:2410.05740</a> <span> [<a href="https://arxiv.org/pdf/2410.05740">pdf</a>, <a href="https://arxiv.org/format/2410.05740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Learning to Race in Extreme Turning Scene with Active Exploration and Gaussian Process Regression-based MPC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+G">Guoqiang Wu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Cheng Hu</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+W">Wangjia Weng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhouheng Li</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yonghao Fu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongye Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05740v1-abstract-short" style="display: inline;"> Extreme cornering in racing often induces large side-slip angles, presenting a formidable challenge in vehicle control. To tackle this issue, this paper introduces an Active Exploration with Double GPR (AEDGPR) system. The system initiates by planning a minimum-time trajectory with a Gaussian Process Regression(GPR) compensated model. The planning results show that in the cornering section, the ya… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05740v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05740v1-abstract-full" style="display: none;"> Extreme cornering in racing often induces large side-slip angles, presenting a formidable challenge in vehicle control. To tackle this issue, this paper introduces an Active Exploration with Double GPR (AEDGPR) system. The system initiates by planning a minimum-time trajectory with a Gaussian Process Regression(GPR) compensated model. The planning results show that in the cornering section, the yaw angular velocity and side-slip angle are in opposite directions, indicating that the vehicle is drifting. In response, we develop a drift controller based on Model Predictive Control (MPC) and incorporate Gaussian Process Regression to correct discrepancies in the vehicle dynamics model. Moreover, the covariance from the GPR is employed to actively explore various cornering states, aiming to minimize trajectory tracking errors. The proposed algorithm is validated through simulations on the Simulink-Carsim platform and experiments using a 1/10 scale RC vehicle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05740v1-abstract-full').style.display = 'none'; document.getElementById('2410.05740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05323">arXiv:2410.05323</a> <span> [<a href="https://arxiv.org/pdf/2410.05323">pdf</a>, <a href="https://arxiv.org/format/2410.05323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> From Incomplete Coarse-Grained to Complete Fine-Grained: A Two-Stage Framework for Spatiotemporal Data Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Ziyu Sun</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Haoyang Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+E">En Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Funing Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yongjian Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenbin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05323v1-abstract-short" style="display: inline;"> With the rapid development of various sensing devices, spatiotemporal data is becoming increasingly important nowadays. However, due to sensing costs and privacy concerns, the collected data is often incomplete and coarse-grained, limiting its application to specific tasks. To address this, we propose a new task called spatiotemporal data reconstruction, which aims to infer complete and fine-grain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05323v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05323v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05323v1-abstract-full" style="display: none;"> With the rapid development of various sensing devices, spatiotemporal data is becoming increasingly important nowadays. However, due to sensing costs and privacy concerns, the collected data is often incomplete and coarse-grained, limiting its application to specific tasks. To address this, we propose a new task called spatiotemporal data reconstruction, which aims to infer complete and fine-grained data from sparse and coarse-grained observations. To achieve this, we introduce a two-stage data inference framework, DiffRecon, grounded in the Denoising Diffusion Probabilistic Model (DDPM). In the first stage, we present Diffusion-C, a diffusion model augmented by ST-PointFormer, a powerful encoder designed to leverage the spatial correlations between sparse data points. Following this, the second stage introduces Diffusion-F, which incorporates the proposed T-PatternNet to capture the temporal pattern within sequential data. Together, these two stages form an end-to-end framework capable of inferring complete, fine-grained data from incomplete and coarse-grained observations. We conducted experiments on multiple real-world datasets to demonstrate the superiority of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05323v1-abstract-full').style.display = 'none'; document.getElementById('2410.05323v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01308">arXiv:2410.01308</a> <span> [<a href="https://arxiv.org/pdf/2410.01308">pdf</a>, <a href="https://arxiv.org/ps/2410.01308">ps</a>, <a href="https://arxiv.org/format/2410.01308">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking the Expressiveness of GNNs: A Computational Model Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+G">Guanyu Cui</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhewei Wei</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hsin-Hao Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01308v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) are extensively employed in graph machine learning, with considerable research focusing on their expressiveness. Current studies often assess GNN expressiveness by comparing them to the Weisfeiler-Lehman (WL) tests or classical graph algorithms. However, we identify three key issues in existing analyses: (1) some studies use preprocessing to enhance expressiveness but… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01308v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01308v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01308v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) are extensively employed in graph machine learning, with considerable research focusing on their expressiveness. Current studies often assess GNN expressiveness by comparing them to the Weisfeiler-Lehman (WL) tests or classical graph algorithms. However, we identify three key issues in existing analyses: (1) some studies use preprocessing to enhance expressiveness but overlook its computational costs; (2) some claim the anonymous WL test's limited power while enhancing expressiveness using non-anonymous features, creating a mismatch; and (3) some characterize message-passing GNNs (MPGNNs) with the CONGEST model but make unrealistic assumptions about computational resources, allowing $\textsf{NP-Complete}$ problems to be solved in $O(m)$ depth. We contend that a well-defined computational model is urgently needed to serve as the foundation for discussions on GNN expressiveness. To address these issues, we introduce the Resource-Limited CONGEST (RL-CONGEST) model, incorporating optional preprocessing and postprocessing to form a framework for analyzing GNN expressiveness. Our framework sheds light on computational aspects, including the computational hardness of hash functions in the WL test and the role of virtual nodes in reducing network capacity. Additionally, we suggest that high-order GNNs correspond to first-order model-checking problems, offering new insights into their expressiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01308v1-abstract-full').style.display = 'none'; document.getElementById('2410.01308v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> + </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00425">arXiv:2410.00425</a> <span> [<a href="https://arxiv.org/pdf/2410.00425">pdf</a>, <a href="https://arxiv.org/format/2410.00425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ManiSkill3: GPU Parallelized Robotics Simulation and Rendering for Generalizable Embodied AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+S">Stone Tao</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+F">Fanbo Xiang</a>, <a href="/search/cs?searchtype=author&query=Shukla%2C+A">Arth Shukla</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yuzhe Qin</a>, <a href="/search/cs?searchtype=author&query=Hinrichsen%2C+X">Xander Hinrichsen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiaodi Yuan</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chen Bao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xinsong Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yulin Liu</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+T">Tse-kai Chan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuanlin Li</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+T">Tongzhou Mu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+N">Nan Xiao</a>, <a href="/search/cs?searchtype=author&query=Gurha%2C+A">Arnav Gurha</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhiao Huang</a>, <a href="/search/cs?searchtype=author&query=Calandra%2C+R">Roberto Calandra</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shan Luo</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00425v1-abstract-short" style="display: inline;"> Simulation has enabled unprecedented compute-scalable approaches to robot learning. However, many existing simulation frameworks typically support a narrow range of scenes/tasks and lack features critical for scaling generalizable robotics and sim2real. We introduce and open source ManiSkill3, the fastest state-visual GPU parallelized robotics simulator with contact-rich physics targeting generali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00425v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00425v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00425v1-abstract-full" style="display: none;"> Simulation has enabled unprecedented compute-scalable approaches to robot learning. However, many existing simulation frameworks typically support a narrow range of scenes/tasks and lack features critical for scaling generalizable robotics and sim2real. We introduce and open source ManiSkill3, the fastest state-visual GPU parallelized robotics simulator with contact-rich physics targeting generalizable manipulation. ManiSkill3 supports GPU parallelization of many aspects including simulation+rendering, heterogeneous simulation, pointclouds/voxels visual input, and more. Simulation with rendering on ManiSkill3 can run 10-1000x faster with 2-3x less GPU memory usage than other platforms, achieving up to 30,000+ FPS in benchmarked environments due to minimal python/pytorch overhead in the system, simulation on the GPU, and the use of the SAPIEN parallel rendering system. Tasks that used to take hours to train can now take minutes. We further provide the most comprehensive range of GPU parallelized environments/tasks spanning 12 distinct domains including but not limited to mobile manipulation for tasks such as drawing, humanoids, and dextrous manipulation in realistic scenes designed by artists or real-world digital twins. In addition, millions of demonstration frames are provided from motion planning, RL, and teleoperation. ManiSkill3 also provides a comprehensive set of baselines that span popular RL and learning-from-demonstrations algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00425v1-abstract-full').style.display = 'none'; document.getElementById('2410.00425v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: http://maniskill.ai/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00194">arXiv:2410.00194</a> <span> [<a href="https://arxiv.org/pdf/2410.00194">pdf</a>, <a href="https://arxiv.org/format/2410.00194">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> "Real Learner Data Matters" Exploring the Design of LLM-Powered Question Generation for Deaf and Hard of Hearing Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Si Cheng</a>, <a href="/search/cs?searchtype=author&query=Huffman%2C+S">Shuxu Huffman</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qingxiaoyang Zhu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Haotian Su</a>, <a href="/search/cs?searchtype=author&query=Kushalnagar%2C+R">Raja Kushalnagar</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00194v1-abstract-short" style="display: inline;"> Deaf and Hard of Hearing (DHH) learners face unique challenges in learning environments, often due to a lack of tailored educational materials that address their specific needs. This study explores the potential of Large Language Models (LLMs) to generate personalized quiz questions to enhance DHH students' video-based learning experiences. We developed a prototype leveraging LLMs to generate ques… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00194v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00194v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00194v1-abstract-full" style="display: none;"> Deaf and Hard of Hearing (DHH) learners face unique challenges in learning environments, often due to a lack of tailored educational materials that address their specific needs. This study explores the potential of Large Language Models (LLMs) to generate personalized quiz questions to enhance DHH students' video-based learning experiences. We developed a prototype leveraging LLMs to generate questions with emphasis on two unique strategies: Visual Questions, which identify video segments where visual information might be misrepresented, and Emotion Questions, which highlight moments where previous DHH learners experienced learning difficulty manifested in emotional responses. Through user studies with DHH undergraduates, we evaluated the effectiveness of these LLM-generated questions in supporting the learning experience. Our findings indicate that while LLMs offer significant potential for personalized learning, challenges remain in the interaction accessibility for the diverse DHH community. The study highlights the importance of considering language diversity and culture in LLM-based educational technology design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00194v1-abstract-full').style.display = 'none'; document.getElementById('2410.00194v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19898">arXiv:2409.19898</a> <span> [<a href="https://arxiv.org/pdf/2409.19898">pdf</a>, <a href="https://arxiv.org/format/2409.19898">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UniSumEval: Towards Unified, Fine-Grained, Multi-Dimensional Summarization Evaluation for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yuho Lee</a>, <a href="/search/cs?searchtype=author&query=Yun%2C+T">Taewon Yun</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jason Cai</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Hwanjun Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19898v2-abstract-short" style="display: inline;"> Existing benchmarks for summarization quality evaluation often lack diverse input scenarios, focus on narrowly defined dimensions (e.g., faithfulness), and struggle with subjective and coarse-grained annotation schemes. To address these shortcomings, we create UniSumEval benchmark, which extends the range of input context (e.g., domain, length) and provides fine-grained, multi-dimensional annotati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19898v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19898v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19898v2-abstract-full" style="display: none;"> Existing benchmarks for summarization quality evaluation often lack diverse input scenarios, focus on narrowly defined dimensions (e.g., faithfulness), and struggle with subjective and coarse-grained annotation schemes. To address these shortcomings, we create UniSumEval benchmark, which extends the range of input context (e.g., domain, length) and provides fine-grained, multi-dimensional annotations. We use AI assistance in data creation, identifying potentially hallucinogenic input texts, and also helping human annotators reduce the difficulty of fine-grained annotation tasks. With UniSumEval, we benchmark nine latest language models as summarizers, offering insights into their performance across varying input contexts and evaluation dimensions. Furthermore, we conduct a thorough comparison of SOTA automated summary evaluators. Our benchmark data will be available at https://github.com/DISL-Lab/UniSumEval-v1.0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19898v2-abstract-full').style.display = 'none'; document.getElementById('2409.19898v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP-Findings 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14324">arXiv:2409.14324</a> <span> [<a href="https://arxiv.org/pdf/2409.14324">pdf</a>, <a href="https://arxiv.org/format/2409.14324">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unveiling Narrative Reasoning Limits of Large Language Models with Trope in Movie Synopses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Hung-Ting Su</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+Y">Ya-Ching Hsu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xudong Lin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiang-Qian Shi</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Y">Yulei Niu</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+H">Han-Yuan Hsu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+W+H">Winston H. Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14324v1-abstract-short" style="display: inline;"> Large language models (LLMs) equipped with chain-of-thoughts (CoT) prompting have shown significant multi-step reasoning capabilities in factual content like mathematics, commonsense, and logic. However, their performance in narrative reasoning, which demands greater abstraction capabilities, remains unexplored. This study utilizes tropes in movie synopses to assess the abstract reasoning abilitie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14324v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14324v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14324v1-abstract-full" style="display: none;"> Large language models (LLMs) equipped with chain-of-thoughts (CoT) prompting have shown significant multi-step reasoning capabilities in factual content like mathematics, commonsense, and logic. However, their performance in narrative reasoning, which demands greater abstraction capabilities, remains unexplored. This study utilizes tropes in movie synopses to assess the abstract reasoning abilities of state-of-the-art LLMs and uncovers their low performance. We introduce a trope-wise querying approach to address these challenges and boost the F1 score by 11.8 points. Moreover, while prior studies suggest that CoT enhances multi-step reasoning, this study shows CoT can cause hallucinations in narrative content, reducing GPT-4's performance. We also introduce an Adversarial Injection method to embed trope-related text tokens into movie synopses without explicit tropes, revealing CoT's heightened sensitivity to such injections. Our comprehensive analysis provides insights for future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14324v1-abstract-full').style.display = 'none'; document.getElementById('2409.14324v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings. The first two authors contributed equally. Code: https://github.com/Shelley1214/Trope</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12946">arXiv:2409.12946</a> <span> [<a href="https://arxiv.org/pdf/2409.12946">pdf</a>, <a href="https://arxiv.org/format/2409.12946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Semi-supervised Adversarial Robustness via Noise-aware Online Robust Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tsung-Han Wu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hung-Ting Su</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shang-Tse Chen</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+W+H">Winston H. Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12946v1-abstract-short" style="display: inline;"> The robust self-training (RST) framework has emerged as a prominent approach for semi-supervised adversarial training. To explore the possibility of tackling more complicated tasks with even lower labeling budgets, unlike prior approaches that rely on robust pretrained models, we present SNORD - a simple yet effective framework that introduces contemporary semi-supervised learning techniques into… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12946v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12946v1-abstract-full" style="display: none;"> The robust self-training (RST) framework has emerged as a prominent approach for semi-supervised adversarial training. To explore the possibility of tackling more complicated tasks with even lower labeling budgets, unlike prior approaches that rely on robust pretrained models, we present SNORD - a simple yet effective framework that introduces contemporary semi-supervised learning techniques into the realm of adversarial training. By enhancing pseudo labels and managing noisy training data more effectively, SNORD showcases impressive, state-of-the-art performance across diverse datasets and labeling budgets, all without the need for pretrained models. Compared to full adversarial supervision, SNORD achieves a 90% relative robust accuracy under epsilon = 8/255 AutoAttack, requiring less than 0.1%, 2%, and 10% labels for CIFAR-10, CIFAR-100, and TinyImageNet-200, respectively. Additional experiments confirm the efficacy of each component and demonstrate the adaptability of integrating SNORD with existing adversarial pretraining strategies to further bolster robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12946v1-abstract-full').style.display = 'none'; document.getElementById('2409.12946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 4 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09777">arXiv:2409.09777</a> <span> [<a href="https://arxiv.org/pdf/2409.09777">pdf</a>, <a href="https://arxiv.org/format/2409.09777">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> DiFSD: Ego-Centric Fully Sparse Paradigm with Uncertainty Denoising and Iterative Refinement for Efficient End-to-End Self-Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Haisheng Su</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09777v3-abstract-short" style="display: inline;"> Current end-to-end autonomous driving methods resort to unifying modular designs for various tasks (e.g. perception, prediction and planning). Although optimized in a planning-oriented spirit with a fully differentiable framework, existing end-to-end driving systems without ego-centric designs still suffer from unsatisfactory performance and inferior efficiency, owing to the rasterized scene repre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09777v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09777v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09777v3-abstract-full" style="display: none;"> Current end-to-end autonomous driving methods resort to unifying modular designs for various tasks (e.g. perception, prediction and planning). Although optimized in a planning-oriented spirit with a fully differentiable framework, existing end-to-end driving systems without ego-centric designs still suffer from unsatisfactory performance and inferior efficiency, owing to the rasterized scene representation learning and redundant information transmission. In this paper, we revisit the human driving behavior and propose an ego-centric fully sparse paradigm, named DiFSD, for end-to-end self-driving. Specifically, DiFSD mainly consists of sparse perception, hierarchical interaction and iterative motion planner. The sparse perception module performs detection, tracking and online mapping based on sparse representation of the driving scene. The hierarchical interaction module aims to select the Closest In-Path Vehicle / Stationary (CIPV / CIPS) from coarse to fine, benefiting from an additional geometric prior. As for the iterative motion planner, both selected interactive agents and ego-vehicle are considered for joint motion prediction, where the output multi-modal ego-trajectories are optimized in an iterative fashion. Besides, both position-level motion diffusion and trajectory-level planning denoising are introduced for uncertainty modeling, thus facilitating the training stability and convergence of the whole framework. Extensive experiments conducted on nuScenes and Bench2Drive datasets demonstrate the superior planning performance and great efficiency of DiFSD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09777v3-abstract-full').style.display = 'none'; document.getElementById('2409.09777v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09591">arXiv:2409.09591</a> <span> [<a href="https://arxiv.org/pdf/2409.09591">pdf</a>, <a href="https://arxiv.org/format/2409.09591">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Open-World Test-Time Training: Self-Training with Contrast Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Houcheng Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengzhu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiao Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingli Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Daixian Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09591v1-abstract-short" style="display: inline;"> Traditional test-time training (TTT) methods, while addressing domain shifts, often assume a consistent class set, limiting their applicability in real-world scenarios characterized by infinite variety. Open-World Test-Time Training (OWTTT) addresses the challenge of generalizing deep learning models to unknown target domain distributions, especially in the presence of strong Out-of-Distribution (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09591v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09591v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09591v1-abstract-full" style="display: none;"> Traditional test-time training (TTT) methods, while addressing domain shifts, often assume a consistent class set, limiting their applicability in real-world scenarios characterized by infinite variety. Open-World Test-Time Training (OWTTT) addresses the challenge of generalizing deep learning models to unknown target domain distributions, especially in the presence of strong Out-of-Distribution (OOD) data. Existing TTT methods often struggle to maintain performance when confronted with strong OOD data. In OWTTT, the focus has predominantly been on distinguishing between overall strong and weak OOD data. However, during the early stages of TTT, initial feature extraction is hampered by interference from strong OOD and corruptions, resulting in diminished contrast and premature classification of certain classes as strong OOD. To address this, we introduce Open World Dynamic Contrastive Learning (OWDCL), an innovative approach that utilizes contrastive learning to augment positive sample pairs. This strategy not only bolsters contrast in the early stages but also significantly enhances model robustness in subsequent stages. In comparison datasets, our OWDCL model has produced the most advanced performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09591v1-abstract-full').style.display = 'none'; document.getElementById('2409.09591v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09406">arXiv:2409.09406</a> <span> [<a href="https://arxiv.org/pdf/2409.09406">pdf</a>, <a href="https://arxiv.org/format/2409.09406">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Real-world Adversarial Defense against Patch Attacks based on Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xingxing Wei</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+C">Caixin Kang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengyi Wang</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+S">Shouwei Ruan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yubo Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09406v1-abstract-short" style="display: inline;"> Adversarial patches present significant challenges to the robustness of deep learning models, making the development of effective defenses become critical for real-world applications. This paper introduces DIFFender, a novel DIFfusion-based DeFender framework that leverages the power of a text-guided diffusion model to counter adversarial patch attacks. At the core of our approach is the discovery… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09406v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09406v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09406v1-abstract-full" style="display: none;"> Adversarial patches present significant challenges to the robustness of deep learning models, making the development of effective defenses become critical for real-world applications. This paper introduces DIFFender, a novel DIFfusion-based DeFender framework that leverages the power of a text-guided diffusion model to counter adversarial patch attacks. At the core of our approach is the discovery of the Adversarial Anomaly Perception (AAP) phenomenon, which enables the diffusion model to accurately detect and locate adversarial patches by analyzing distributional anomalies. DIFFender seamlessly integrates the tasks of patch localization and restoration within a unified diffusion model framework, enhancing defense efficacy through their close interaction. Additionally, DIFFender employs an efficient few-shot prompt-tuning algorithm, facilitating the adaptation of the pre-trained diffusion model to defense tasks without the need for extensive retraining. Our comprehensive evaluation, covering image classification and face recognition tasks, as well as real-world scenarios, demonstrates DIFFender's robust performance against adversarial attacks. The framework's versatility and generalizability across various settings, classifiers, and attack methodologies mark a significant advancement in adversarial patch defense strategies. Except for the popular visible domain, we have identified another advantage of DIFFender: its capability to easily expand into the infrared domain. Consequently, we demonstrate the good flexibility of DIFFender, which can defend against both infrared and visible adversarial patch attacks alternatively using a universal defense framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09406v1-abstract-full').style.display = 'none'; document.getElementById('2409.09406v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04837">arXiv:2409.04837</a> <span> [<a href="https://arxiv.org/pdf/2409.04837">pdf</a>, <a href="https://arxiv.org/format/2409.04837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Context-Aware Replanning with Pre-explored Semantic Map for Object Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ko%2C+P">Po-Chen Ko</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hung-Ting Su</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Ching-Yuan Chen</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+J">Jia-Fong Yeh</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Min Sun</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+W+H">Winston H. Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04837v2-abstract-short" style="display: inline;"> Pre-explored Semantic Maps, constructed through prior exploration using visual language models (VLMs), have proven effective as foundational elements for training-free robotic applications. However, existing approaches assume the map's accuracy and do not provide effective mechanisms for revising decisions based on incorrect maps. To address this, we introduce Context-Aware Replanning (CARe), whic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04837v2-abstract-full').style.display = 'inline'; document.getElementById('2409.04837v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04837v2-abstract-full" style="display: none;"> Pre-explored Semantic Maps, constructed through prior exploration using visual language models (VLMs), have proven effective as foundational elements for training-free robotic applications. However, existing approaches assume the map's accuracy and do not provide effective mechanisms for revising decisions based on incorrect maps. To address this, we introduce Context-Aware Replanning (CARe), which estimates map uncertainty through confidence scores and multi-view consistency, enabling the agent to revise erroneous decisions stemming from inaccurate maps without requiring additional labels. We demonstrate the effectiveness of our proposed method by integrating it with two modern mapping backbones, VLMaps and OpenMask3D, and observe significant performance improvements in object navigation tasks. More details can be found on the project page: https://care-maps.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04837v2-abstract-full').style.display = 'none'; document.getElementById('2409.04837v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoRL 2024 camera ready. The first three authors contributed equally, and their order of authorship is interchangeable. Project page: https://care-maps.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01588">arXiv:2409.01588</a> <span> [<a href="https://arxiv.org/pdf/2409.01588">pdf</a>, <a href="https://arxiv.org/format/2409.01588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3678717.3691254">10.1145/3678717.3691254 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Large-scale Urban Facility Location Selection with Knowledge-informed Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongyuan Su</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yu Zheng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jingtao Ding</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Depeng Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01588v2-abstract-short" style="display: inline;"> The facility location problem (FLP) is a classical combinatorial optimization challenge aimed at strategically laying out facilities to maximize their accessibility. In this paper, we propose a reinforcement learning method tailored to solve large-scale urban FLP, capable of producing near-optimal solutions at superfast inference speed. We distill the essential swap operation from local search, an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01588v2-abstract-full').style.display = 'inline'; document.getElementById('2409.01588v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01588v2-abstract-full" style="display: none;"> The facility location problem (FLP) is a classical combinatorial optimization challenge aimed at strategically laying out facilities to maximize their accessibility. In this paper, we propose a reinforcement learning method tailored to solve large-scale urban FLP, capable of producing near-optimal solutions at superfast inference speed. We distill the essential swap operation from local search, and simulate it by intelligently selecting edges on a graph of urban regions, guided by a knowledge-informed graph neural network, thus sidestepping the need for heavy computation of local search. Extensive experiments on four US cities with different geospatial conditions demonstrate that our approach can achieve comparable performance to commercial solvers with less than 5\% accessibility loss, while displaying up to 1000 times speedup. We deploy our model as an online geospatial application at https://huggingface.co/spaces/randommmm/MFLP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01588v2-abstract-full').style.display = 'none'; document.getElementById('2409.01588v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Sigspatial2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T20 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17443">arXiv:2408.17443</a> <span> [<a href="https://arxiv.org/pdf/2408.17443">pdf</a>, <a href="https://arxiv.org/format/2408.17443">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HERMES: temporal-coHERent long-forM understanding with Episodes and Semantics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Faure%2C+G+J">Gueter Josmy Faure</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+J">Jia-Fong Yeh</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Min-Hung Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hung-Ting Su</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+S">Shang-Hong Lai</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+W+H">Winston H. Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17443v3-abstract-short" style="display: inline;"> Existing research often treats long-form videos as extended short videos, leading to several limitations: inadequate capture of long-range dependencies, inefficient processing of redundant information, and failure to extract high-level semantic concepts. To address these issues, we propose a novel approach that more accurately reflects human cognition. This paper introduces HERMES: temporal-coHERe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17443v3-abstract-full').style.display = 'inline'; document.getElementById('2408.17443v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17443v3-abstract-full" style="display: none;"> Existing research often treats long-form videos as extended short videos, leading to several limitations: inadequate capture of long-range dependencies, inefficient processing of redundant information, and failure to extract high-level semantic concepts. To address these issues, we propose a novel approach that more accurately reflects human cognition. This paper introduces HERMES: temporal-coHERent long-forM understanding with Episodes and Semantics, a model that simulates episodic memory accumulation to capture action sequences and reinforces them with semantic knowledge dispersed throughout the video. Our work makes two key contributions: First, we develop an Episodic COmpressor (ECO) that efficiently aggregates crucial representations from micro to semi-macro levels, overcoming the challenge of long-range dependencies. Second, we propose a Semantics ReTRiever (SeTR) that enhances these aggregated representations with semantic information by focusing on the broader context, dramatically reducing feature dimensionality while preserving relevant macro-level information. This addresses the issues of redundancy and lack of high-level concept extraction. Extensive experiments demonstrate that HERMES achieves state-of-the-art performance across multiple long-video understanding benchmarks in both zero-shot and fully-supervised settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17443v3-abstract-full').style.display = 'none'; document.getElementById('2408.17443v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is an improved and expanded version of our EVAL-FoMo Workshop at ECCV'24 (v1 of this paper). Project page: https://joslefaure.github.io/assets/html/hermes.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17027">arXiv:2408.17027</a> <span> [<a href="https://arxiv.org/pdf/2408.17027">pdf</a>, <a href="https://arxiv.org/format/2408.17027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ConDense: Consistent 2D/3D Pre-training for Dense and Sparse Features from Multi-View Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoshuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Howard Zhou</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+S">Soham Ghosh</a>, <a href="/search/cs?searchtype=author&query=Gnanapragasam%2C+D">Danushen Gnanapragasam</a>, <a href="/search/cs?searchtype=author&query=Jampani%2C+V">Varun Jampani</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17027v1-abstract-short" style="display: inline;"> To advance the state of the art in the creation of 3D foundation models, this paper introduces the ConDense framework for 3D pre-training utilizing existing pre-trained 2D networks and large-scale multi-view datasets. We propose a novel 2D-3D joint training scheme to extract co-embedded 2D and 3D features in an end-to-end pipeline, where 2D-3D feature consistency is enforced through a volume rende… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17027v1-abstract-full').style.display = 'inline'; document.getElementById('2408.17027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17027v1-abstract-full" style="display: none;"> To advance the state of the art in the creation of 3D foundation models, this paper introduces the ConDense framework for 3D pre-training utilizing existing pre-trained 2D networks and large-scale multi-view datasets. We propose a novel 2D-3D joint training scheme to extract co-embedded 2D and 3D features in an end-to-end pipeline, where 2D-3D feature consistency is enforced through a volume rendering NeRF-like ray marching process. Using dense per pixel features we are able to 1) directly distill the learned priors from 2D models to 3D models and create useful 3D backbones, 2) extract more consistent and less noisy 2D features, 3) formulate a consistent embedding space where 2D, 3D, and other modalities of data (e.g., natural language prompts) can be jointly queried. Furthermore, besides dense features, ConDense can be trained to extract sparse features (e.g., key points), also with 2D-3D consistency -- condensing 3D NeRF representations into compact sets of decorated key points. We demonstrate that our pre-trained model provides good initialization for various 3D tasks including 3D classification and segmentation, outperforming other 3D pre-training methods by a significant margin. It also enables, by exploiting our sparse features, additional useful downstream tasks, such as matching 2D images to 3D scenes, detecting duplicate 3D scenes, and querying a repository of 3D scenes through natural language -- all quite efficiently and without any per-scene fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17027v1-abstract-full').style.display = 'none'; document.getElementById('2408.17027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16027">arXiv:2408.16027</a> <span> [<a href="https://arxiv.org/pdf/2408.16027">pdf</a>, <a href="https://arxiv.org/format/2408.16027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Toward Time-Continuous Data Inference in Sparse Urban CrowdSensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Ziyu Sun</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Haoyang Su</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hanqi Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+E">En Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenbin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16027v1-abstract-short" style="display: inline;"> Mobile Crowd Sensing (MCS) is a promising paradigm that leverages mobile users and their smart portable devices to perform various real-world tasks. However, due to budget constraints and the inaccessibility of certain areas, Sparse MCS has emerged as a more practical alternative, collecting data from a limited number of target subareas and utilizing inference algorithms to complete the full sensi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16027v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16027v1-abstract-full" style="display: none;"> Mobile Crowd Sensing (MCS) is a promising paradigm that leverages mobile users and their smart portable devices to perform various real-world tasks. However, due to budget constraints and the inaccessibility of certain areas, Sparse MCS has emerged as a more practical alternative, collecting data from a limited number of target subareas and utilizing inference algorithms to complete the full sensing map. While existing approaches typically assume a time-discrete setting with data remaining constant within each sensing cycle, this simplification can introduce significant errors, especially when dealing with long cycles, as real-world sensing data often changes continuously. In this paper, we go from fine-grained completion, i.e., the subdivision of sensing cycles into minimal time units, towards a more accurate, time-continuous completion. We first introduce Deep Matrix Factorization (DMF) as a neural network-enabled framework and enhance it with a Recurrent Neural Network (RNN-DMF) to capture temporal correlations in these finer time slices. To further deal with the continuous data, we propose TIME-DMF, which captures temporal information across unequal intervals, enabling time-continuous completion. Additionally, we present the Query-Generate (Q-G) strategy within TIME-DMF to model the infinite states of continuous data. Extensive experiments across five types of sensing tasks demonstrate the effectiveness of our models and the advantages of time-continuous completion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16027v1-abstract-full').style.display = 'none'; document.getElementById('2408.16027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15503">arXiv:2408.15503</a> <span> [<a href="https://arxiv.org/pdf/2408.15503">pdf</a>, <a href="https://arxiv.org/format/2408.15503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RoboSense: Large-scale Dataset and Benchmark for Egocentric Robot Perception and Navigation in Crowded and Unstructured Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Haisheng Su</a>, <a href="/search/cs?searchtype=author&query=Song%2C+F">Feixiang Song</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Cong Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15503v4-abstract-short" style="display: inline;"> Reliable embodied perception from an egocentric perspective is challenging yet essential for autonomous navigation technology of intelligent mobile agents. With the growing demand of social robotics, near-field scene understanding becomes an important research topic in the areas of egocentric perceptual tasks related to navigation in both crowded and unstructured environments. Due to the complexit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15503v4-abstract-full').style.display = 'inline'; document.getElementById('2408.15503v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15503v4-abstract-full" style="display: none;"> Reliable embodied perception from an egocentric perspective is challenging yet essential for autonomous navigation technology of intelligent mobile agents. With the growing demand of social robotics, near-field scene understanding becomes an important research topic in the areas of egocentric perceptual tasks related to navigation in both crowded and unstructured environments. Due to the complexity of environmental conditions and difficulty of surrounding obstacles owing to truncation and occlusion, the perception capability under this circumstance is still inferior. To further enhance the intelligence of mobile robots, in this paper, we setup an egocentric multi-sensor data collection platform based on 3 main types of sensors (Camera, LiDAR and Fisheye), which supports flexible sensor configurations to enable dynamic sight of view from ego-perspective, capturing either near or farther areas. Meanwhile, a large-scale multimodal dataset is constructed, named RoboSense, to facilitate egocentric robot perception. Specifically, RoboSense contains more than 133K synchronized data with 1.4M 3D bounding box and IDs annotated in the full $360^{\circ}$ view, forming 216K trajectories across 7.6K temporal sequences. It has $270\times$ and $18\times$ as many annotations of surrounding obstacles within near ranges as the previous datasets collected for autonomous driving scenarios such as KITTI and nuScenes. Moreover, we define a novel matching criterion for near-field 3D perception and prediction metrics. Based on RoboSense, we formulate 6 popular tasks to facilitate the future research development, where the detailed analysis as well as benchmarks are also provided accordingly. Data desensitization measures have been conducted for privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15503v4-abstract-full').style.display = 'none'; document.getElementById('2408.15503v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10198">arXiv:2408.10198</a> <span> [<a href="https://arxiv.org/pdf/2408.10198">pdf</a>, <a href="https://arxiv.org/format/2408.10198">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> MeshFormer: High-Quality Mesh Generation with 3D-Guided Reconstruction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghua Liu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+C">Chong Zeng</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xinyue Wei</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+R">Ruoxi Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Linghao Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaoning Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoshuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+I">Isabella Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hongzhi Wu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10198v1-abstract-short" style="display: inline;"> Open-world 3D reconstruction models have recently garnered significant attention. However, without sufficient 3D inductive bias, existing methods typically entail expensive training costs and struggle to extract high-quality 3D meshes. In this work, we introduce MeshFormer, a sparse-view reconstruction model that explicitly leverages 3D native structure, input guidance, and training supervision. S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10198v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10198v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10198v1-abstract-full" style="display: none;"> Open-world 3D reconstruction models have recently garnered significant attention. However, without sufficient 3D inductive bias, existing methods typically entail expensive training costs and struggle to extract high-quality 3D meshes. In this work, we introduce MeshFormer, a sparse-view reconstruction model that explicitly leverages 3D native structure, input guidance, and training supervision. Specifically, instead of using a triplane representation, we store features in 3D sparse voxels and combine transformers with 3D convolutions to leverage an explicit 3D structure and projective bias. In addition to sparse-view RGB input, we require the network to take input and generate corresponding normal maps. The input normal maps can be predicted by 2D diffusion models, significantly aiding in the guidance and refinement of the geometry's learning. Moreover, by combining Signed Distance Function (SDF) supervision with surface rendering, we directly learn to generate high-quality meshes without the need for complex multi-stage training processes. By incorporating these explicit 3D biases, MeshFormer can be trained efficiently and deliver high-quality textured meshes with fine-grained geometric details. It can also be integrated with 2D diffusion models to enable fast single-image-to-3D and text-to-3D tasks. Project page: https://meshformer3d.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10198v1-abstract-full').style.display = 'none'; document.getElementById('2408.10198v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10195">arXiv:2408.10195</a> <span> [<a href="https://arxiv.org/pdf/2408.10195">pdf</a>, <a href="https://arxiv.org/format/2408.10195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> SpaRP: Fast 3D Object Reconstruction and Pose Estimation from Sparse Views </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Ang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Linghao Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yulin Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+R">Ruoxi Shi</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghua Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10195v1-abstract-short" style="display: inline;"> Open-world 3D generation has recently attracted considerable attention. While many single-image-to-3D methods have yielded visually appealing outcomes, they often lack sufficient controllability and tend to produce hallucinated regions that may not align with users' expectations. In this paper, we explore an important scenario in which the input consists of one or a few unposed 2D images of a sing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10195v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10195v1-abstract-full" style="display: none;"> Open-world 3D generation has recently attracted considerable attention. While many single-image-to-3D methods have yielded visually appealing outcomes, they often lack sufficient controllability and tend to produce hallucinated regions that may not align with users' expectations. In this paper, we explore an important scenario in which the input consists of one or a few unposed 2D images of a single object, with little or no overlap. We propose a novel method, SpaRP, to reconstruct a 3D textured mesh and estimate the relative camera poses for these sparse-view images. SpaRP distills knowledge from 2D diffusion models and finetunes them to implicitly deduce the 3D spatial relationships between the sparse views. The diffusion model is trained to jointly predict surrogate representations for camera poses and multi-view images of the object under known poses, integrating all information from the input sparse views. These predictions are then leveraged to accomplish 3D reconstruction and pose estimation, and the reconstructed 3D model can be used to further refine the camera poses of input views. Through extensive experiments on three datasets, we demonstrate that our method not only significantly outperforms baseline methods in terms of 3D reconstruction quality and pose prediction accuracy but also exhibits strong efficiency. It requires only about 20 seconds to produce a textured mesh and camera poses for the input views. Project page: https://chaoxu.xyz/sparp. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10195v1-abstract-full').style.display = 'none'; document.getElementById('2408.10195v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09962">arXiv:2408.09962</a> <span> [<a href="https://arxiv.org/pdf/2408.09962">pdf</a>, <a href="https://arxiv.org/format/2408.09962">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Validation of the Results of Cross-chain Smart Contract Based on Confirmation Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Hong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09962v1-abstract-short" style="display: inline;"> Smart contracts are widely utilized in cross-chain interactions, where their results are transmitted from one blockchain (the producer blockchain) to another (the consumer blockchain). Unfortunately, the consumer blockchain often accepts these results without executing the smart contracts for validation, posing potential security risks. To address this, we propose a method for validating cross-cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09962v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09962v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09962v1-abstract-full" style="display: none;"> Smart contracts are widely utilized in cross-chain interactions, where their results are transmitted from one blockchain (the producer blockchain) to another (the consumer blockchain). Unfortunately, the consumer blockchain often accepts these results without executing the smart contracts for validation, posing potential security risks. To address this, we propose a method for validating cross-chain smart contract results. Our approach emphasizes consumer blockchain execution of cross-chain smart contracts of producer blockchain, allowing comparison of results with the transmitted ones to detect potential discrepancies and ensure data integrity during cross-chain data dissemination. Additionally, we introduce the confirmation with proof method, which involves incorporating the chain of blocks and relevant cross-chain smart contract data from the producer blockchain into the consumer blockchain as evidence (or proof), establishing a unified and secure perspective of cross-chain smart contract results. Our verification results highlight the feasibility of cross-chain validation at the smart contract level. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09962v1-abstract-full').style.display = 'none'; document.getElementById('2408.09962v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09958">arXiv:2408.09958</a> <span> [<a href="https://arxiv.org/pdf/2408.09958">pdf</a>, <a href="https://arxiv.org/format/2408.09958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AdaResNet: Enhancing Residual Networks with Dynamic Weight Adjustment for Improved Feature Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Hong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09958v1-abstract-short" style="display: inline;"> In very deep neural networks, gradients can become extremely small during backpropagation, making it challenging to train the early layers. ResNet (Residual Network) addresses this issue by enabling gradients to flow directly through the network via skip connections, facilitating the training of much deeper networks. However, in these skip connections, the input ipd is directly added to the transf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09958v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09958v1-abstract-full" style="display: none;"> In very deep neural networks, gradients can become extremely small during backpropagation, making it challenging to train the early layers. ResNet (Residual Network) addresses this issue by enabling gradients to flow directly through the network via skip connections, facilitating the training of much deeper networks. However, in these skip connections, the input ipd is directly added to the transformed data tfd, treating ipd and tfd equally, without adapting to different scenarios. In this paper, we propose AdaResNet (Auto-Adapting Residual Network), which automatically adjusts the ratio between ipd and tfd based on the training data. We introduce a variable, weight}_{tfd}^{ipd, to represent this ratio. This variable is dynamically adjusted during backpropagation, allowing it to adapt to the training data rather than remaining fixed. Experimental results demonstrate that AdaResNet achieves a maximum accuracy improvement of over 50\% compared to traditional ResNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09958v1-abstract-full').style.display = 'none'; document.getElementById('2408.09958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05671">arXiv:2408.05671</a> <span> [<a href="https://arxiv.org/pdf/2408.05671">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Research on Heterogeneous Computation Resource Allocation based on Data-driven Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xirui Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xiaowei Cai</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Honghua Su</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+C">Changsong Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05671v1-abstract-short" style="display: inline;"> The rapid development of the mobile Internet and the Internet of Things is leading to a diversification of user devices and the emergence of new mobile applications on a regular basis. Such applications include those that are computationally intensive, such as pattern recognition, interactive gaming, virtual reality, and augmented reality. However, the computing and energy resources available on t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05671v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05671v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05671v1-abstract-full" style="display: none;"> The rapid development of the mobile Internet and the Internet of Things is leading to a diversification of user devices and the emergence of new mobile applications on a regular basis. Such applications include those that are computationally intensive, such as pattern recognition, interactive gaming, virtual reality, and augmented reality. However, the computing and energy resources available on the user's equipment are limited, which presents a challenge in effectively supporting such demanding applications. In this work, we propose a heterogeneous computing resource allocation model based on a data-driven approach. The model first collects and analyzes historical workload data at scale, extracts key features, and builds a detailed data set. Then, a data-driven deep neural network is used to predict future resource requirements. Based on the prediction results, the model adopts a dynamic adjustment and optimization resource allocation strategy. This strategy not only fully considers the characteristics of different computing resources, but also accurately matches the requirements of various tasks, and realizes dynamic and flexible resource allocation, thereby greatly improving the overall performance and resource utilization of the system. Experimental results show that the proposed method is significantly better than the traditional resource allocation method in a variety of scenarios, demonstrating its excellent accuracy and adaptability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05671v1-abstract-full').style.display = 'none'; document.getElementById('2408.05671v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03907">arXiv:2408.03907</a> <span> [<a href="https://arxiv.org/pdf/2408.03907">pdf</a>, <a href="https://arxiv.org/format/2408.03907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Decoding Biases: Automated Methods and LLM Judges for Gender Bias Detection in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumar%2C+S+H">Shachi H Kumar</a>, <a href="/search/cs?searchtype=author&query=Sahay%2C+S">Saurav Sahay</a>, <a href="/search/cs?searchtype=author&query=Mazumder%2C+S">Sahisnu Mazumder</a>, <a href="/search/cs?searchtype=author&query=Okur%2C+E">Eda Okur</a>, <a href="/search/cs?searchtype=author&query=Manuvinakurike%2C+R">Ramesh Manuvinakurike</a>, <a href="/search/cs?searchtype=author&query=Beckage%2C+N">Nicole Beckage</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hsuan Su</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a>, <a href="/search/cs?searchtype=author&query=Nachman%2C+L">Lama Nachman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03907v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have excelled at language understanding and generating human-level text. However, even with supervised training and human alignment, these LLMs are susceptible to adversarial attacks where malicious users can prompt the model to generate undesirable text. LLMs also inherently encode potential biases that can cause various harmful effects during interactions. Bias evalu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03907v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03907v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have excelled at language understanding and generating human-level text. However, even with supervised training and human alignment, these LLMs are susceptible to adversarial attacks where malicious users can prompt the model to generate undesirable text. LLMs also inherently encode potential biases that can cause various harmful effects during interactions. Bias evaluation metrics lack standards as well as consensus and existing methods often rely on human-generated templates and annotations which are expensive and labor intensive. In this work, we train models to automatically create adversarial prompts to elicit biased responses from target LLMs. We present LLM- based bias evaluation metrics and also analyze several existing automatic evaluation methods and metrics. We analyze the various nuances of model responses, identify the strengths and weaknesses of model families, and assess where evaluation methods fall short. We compare these metrics to human evaluation and validate that the LLM-as-a-Judge metric aligns with human judgement on bias in response generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03907v1-abstract-full').style.display = 'none'; document.getElementById('2408.03907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages paper content, 17 pages of appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18443">arXiv:2407.18443</a> <span> [<a href="https://arxiv.org/pdf/2407.18443">pdf</a>, <a href="https://arxiv.org/format/2407.18443">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HybridDepth: Robust Metric Depth Fusion by Leveraging Depth from Focus and Single-Image Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ganj%2C+A">Ashkan Ganj</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+T">Tian Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18443v2-abstract-short" style="display: inline;"> We propose HYBRIDDEPTH, a robust depth estimation pipeline that addresses key challenges in depth estimation,including scale ambiguity, hardware heterogeneity, and generalizability. HYBRIDDEPTH leverages focal stack, data conveniently accessible in common mobile devices, to produce accurate metric depth maps. By incorporating depth priors afforded by recent advances in singleimage depth estimation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18443v2-abstract-full').style.display = 'inline'; document.getElementById('2407.18443v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18443v2-abstract-full" style="display: none;"> We propose HYBRIDDEPTH, a robust depth estimation pipeline that addresses key challenges in depth estimation,including scale ambiguity, hardware heterogeneity, and generalizability. HYBRIDDEPTH leverages focal stack, data conveniently accessible in common mobile devices, to produce accurate metric depth maps. By incorporating depth priors afforded by recent advances in singleimage depth estimation, our model achieves a higher level of structural detail compared to existing methods. We test our pipeline as an end-to-end system, with a newly developed mobile client to capture focal stacks, which are then sent to a GPU-powered server for depth estimation. Comprehensive quantitative and qualitative analyses demonstrate that HYBRIDDEPTH outperforms state-of-the-art(SOTA) models on common datasets such as DDFF12 and NYU Depth V2. HYBRIDDEPTH also shows strong zero-shot generalization. When trained on NYU Depth V2, HYBRIDDEPTH surpasses SOTA models in zero-shot performance on ARKitScenes and delivers more structurally accurate depth maps on Mobile Depth. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18443v2-abstract-full').style.display = 'none'; document.getElementById('2407.18443v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12883">arXiv:2407.12883</a> <span> [<a href="https://arxiv.org/pdf/2407.12883">pdf</a>, <a href="https://arxiv.org/format/2407.12883">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongjin Su</a>, <a href="/search/cs?searchtype=author&query=Yen%2C+H">Howard Yen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Mengzhou Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Han-yu Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haisu Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Q">Quan Shi</a>, <a href="/search/cs?searchtype=author&query=Siegel%2C+Z+S">Zachary S. Siegel</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+M">Michael Tang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoxi Sun</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+J">Jinsung Yoon</a>, <a href="/search/cs?searchtype=author&query=Arik%2C+S+O">Sercan O. Arik</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danqi Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12883v3-abstract-short" style="display: inline;"> Existing retrieval benchmarks primarily consist of information-seeking queries (e.g., aggregated questions from search engines) where keyword or semantic-based retrieval is usually sufficient. However, many complex real-world queries require in-depth reasoning to identify relevant documents that go beyond surface form matching. For example, finding documentation for a coding question requires unde… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12883v3-abstract-full').style.display = 'inline'; document.getElementById('2407.12883v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12883v3-abstract-full" style="display: none;"> Existing retrieval benchmarks primarily consist of information-seeking queries (e.g., aggregated questions from search engines) where keyword or semantic-based retrieval is usually sufficient. However, many complex real-world queries require in-depth reasoning to identify relevant documents that go beyond surface form matching. For example, finding documentation for a coding question requires understanding the logic and syntax of the functions involved. To better benchmark retrieval on such challenging queries, we introduce BRIGHT, the first text retrieval benchmark that requires intensive reasoning to retrieve relevant documents. Our dataset consists of 1,384 real-world queries spanning diverse domains, such as economics, psychology, mathematics, and coding. These queries are drawn from naturally occurring and carefully curated human data. Extensive evaluation reveals that even state-of-the-art retrieval models perform poorly on BRIGHT. The leading model on the MTEB leaderboard (Muennighoff et al., 2023), which achieves a score of 59.0 nDCG@10, produces a score of nDCG@10 of 18.3 on BRIGHT. We show that incorporating explicit reasoning about the query improves retrieval performance by up to 12.2 points. Moreover, incorporating retrieved documents from the top-performing retriever boosts question-answering performance by over 6.6 points. We believe that BRIGHT paves the way for future research on retrieval systems in more realistic and challenging settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12883v3-abstract-full').style.display = 'none'; document.getElementById('2407.12883v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">48 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Su%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository