Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,700 results for author: <span class="mathjax">Xue, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xue%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Xue, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xue%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xue, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xue%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14314">arXiv:2411.14314</a> <span> [<a href="https://arxiv.org/pdf/2411.14314">pdf</a>, <a href="https://arxiv.org/format/2411.14314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> Switching Graph Matrix Norm Bounds: from i.i.d. to Random Regular Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jeff Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14314v1-abstract-short" style="display: inline;"> In this work, we give novel spectral norm bounds for graph matrix on inputs being random regular graphs. Graph matrix is a family of random matrices with entries given by polynomial functions of the underlying input. These matrices have been known to be the backbone for the analysis of various average-case algorithms and hardness. Previous investigations of such matrices are largely restricted to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14314v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14314v1-abstract-full" style="display: none;"> In this work, we give novel spectral norm bounds for graph matrix on inputs being random regular graphs. Graph matrix is a family of random matrices with entries given by polynomial functions of the underlying input. These matrices have been known to be the backbone for the analysis of various average-case algorithms and hardness. Previous investigations of such matrices are largely restricted to the \Erdos-\Renyi model, and tight matrix norm bounds on regular graphs are only known for specific examples. We unite these two lines of investigations, and give the first result departing from the \Erdos-\Renyi setting in the full generality of graph matrices. We believe our norm bound result would enable a simple transfer of spectral analysis for average-case algorithms and hardness between these two distributions of random graphs. As an application of our spectral norm bounds, we show that higher-degree Sum-of-Squares lower bounds for the independent set problem on \Erdos-\Renyi random graphs can be switched into lower bounds on random $d$-regular graphs. Our result is the first to address the general open question of analyzing higher-degree Sum-of-Squares on random regular graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14314v1-abstract-full').style.display = 'none'; document.getElementById('2411.14314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14169">arXiv:2411.14169</a> <span> [<a href="https://arxiv.org/pdf/2411.14169">pdf</a>, <a href="https://arxiv.org/format/2411.14169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spatiotemporal Decoupling for Efficient Vision-Based Occupancy Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingyi Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xieyuanli Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Junyi Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jintao Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+L">Ling Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14169v1-abstract-short" style="display: inline;"> The task of occupancy forecasting (OCF) involves utilizing past and present perception data to predict future occupancy states of autonomous vehicle surrounding environments, which is critical for downstream tasks such as obstacle avoidance and path planning. Existing 3D OCF approaches struggle to predict plausible spatial details for movable objects and suffer from slow inference speeds due to ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14169v1-abstract-full" style="display: none;"> The task of occupancy forecasting (OCF) involves utilizing past and present perception data to predict future occupancy states of autonomous vehicle surrounding environments, which is critical for downstream tasks such as obstacle avoidance and path planning. Existing 3D OCF approaches struggle to predict plausible spatial details for movable objects and suffer from slow inference speeds due to neglecting the bias and uneven distribution of changing occupancy states in both space and time. In this paper, we propose a novel spatiotemporal decoupling vision-based paradigm to explicitly tackle the bias and achieve both effective and efficient 3D OCF. To tackle spatial bias in empty areas, we introduce a novel spatial representation that decouples the conventional dense 3D format into 2D bird's-eye view (BEV) occupancy with corresponding height values, enabling 3D OCF derived only from 2D predictions thus enhancing efficiency. To reduce temporal bias on static voxels, we design temporal decoupling to improve end-to-end OCF by temporally associating instances via predicted flows. We develop an efficient multi-head network EfficientOCF to achieve 3D OCF with our devised spatiotemporally decoupled representation. A new metric, conditional IoU (C-IoU), is also introduced to provide a robust 3D OCF performance assessment, especially in datasets with missing or incomplete annotations. The experimental results demonstrate that EfficientOCF surpasses existing baseline methods on accuracy and efficiency, achieving state-of-the-art performance with a fast inference time of 82.33ms with a single GPU. Our code will be released as open source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14169v1-abstract-full').style.display = 'none'; document.getElementById('2411.14169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13628">arXiv:2411.13628</a> <span> [<a href="https://arxiv.org/pdf/2411.13628">pdf</a>, <a href="https://arxiv.org/format/2411.13628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MambaDETR: Query-based Temporal Modeling using State Space Model for Multi-View 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ning%2C+T">Tong Ning</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Ke Lu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xirui Jiang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jian Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13628v1-abstract-short" style="display: inline;"> Utilizing temporal information to improve the performance of 3D detection has made great progress recently in the field of autonomous driving. Traditional transformer-based temporal fusion methods suffer from quadratic computational cost and information decay as the length of the frame sequence increases. In this paper, we propose a novel method called MambaDETR, whose main idea is to implement te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13628v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13628v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13628v1-abstract-full" style="display: none;"> Utilizing temporal information to improve the performance of 3D detection has made great progress recently in the field of autonomous driving. Traditional transformer-based temporal fusion methods suffer from quadratic computational cost and information decay as the length of the frame sequence increases. In this paper, we propose a novel method called MambaDETR, whose main idea is to implement temporal fusion in the efficient state space. Moreover, we design a Motion Elimination module to remove the relatively static objects for temporal fusion. On the standard nuScenes benchmark, our proposed MambaDETR achieves remarkable result in the 3D object detection task, exhibiting state-of-the-art performance among existing temporal fusion methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13628v1-abstract-full').style.display = 'none'; document.getElementById('2411.13628v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13577">arXiv:2411.13577</a> <span> [<a href="https://arxiv.org/pdf/2411.13577">pdf</a>, <a href="https://arxiv.org/format/2411.13577">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> WavChat: A Survey of Spoken Dialogue Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifu Chen</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingyu Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanting Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoda Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qian Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yidi Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingzhen He</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+Y">Yunfei Chu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13577v1-abstract-short" style="display: inline;"> Recent advancements in spoken dialogue models, exemplified by systems like GPT-4o, have captured significant attention in the speech domain. Compared to traditional three-tier cascaded spoken dialogue models that comprise speech recognition (ASR), large language models (LLMs), and text-to-speech (TTS), modern spoken dialogue models exhibit greater intelligence. These advanced spoken dialogue model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13577v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13577v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13577v1-abstract-full" style="display: none;"> Recent advancements in spoken dialogue models, exemplified by systems like GPT-4o, have captured significant attention in the speech domain. Compared to traditional three-tier cascaded spoken dialogue models that comprise speech recognition (ASR), large language models (LLMs), and text-to-speech (TTS), modern spoken dialogue models exhibit greater intelligence. These advanced spoken dialogue models not only comprehend audio, music, and other speech-related features, but also capture stylistic and timbral characteristics in speech. Moreover, they generate high-quality, multi-turn speech responses with low latency, enabling real-time interaction through simultaneous listening and speaking capability. Despite the progress in spoken dialogue systems, there is a lack of comprehensive surveys that systematically organize and analyze these systems and the underlying technologies. To address this, we have first compiled existing spoken dialogue systems in the chronological order and categorized them into the cascaded and end-to-end paradigms. We then provide an in-depth overview of the core technologies in spoken dialogue models, covering aspects such as speech representation, training paradigm, streaming, duplex, and interaction capabilities. Each section discusses the limitations of these technologies and outlines considerations for future research. Additionally, we present a thorough review of relevant datasets, evaluation metrics, and benchmarks from the perspectives of training and evaluating spoken dialogue systems. We hope this survey will contribute to advancing both academic research and industrial applications in the field of spoken dialogue systems. The related material is available at https://github.com/jishengpeng/WavChat. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13577v1-abstract-full').style.display = 'none'; document.getElementById('2411.13577v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">60 papes, working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13045">arXiv:2411.13045</a> <span> [<a href="https://arxiv.org/pdf/2411.13045">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Explainable LLM-driven Multi-dimensional Distillation for E-Commerce Relevance Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Gang Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Ximing Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chenji Lu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hui Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tianshu Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13045v1-abstract-short" style="display: inline;"> Effective query-item relevance modeling is pivotal for enhancing user experience and safeguarding user satisfaction in e-commerce search systems. Recently, benefiting from the vast inherent knowledge, Large Language Model (LLM) approach demonstrates strong performance and long-tail generalization ability compared with previous neural-based specialized relevance learning methods. Though promising,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13045v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13045v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13045v1-abstract-full" style="display: none;"> Effective query-item relevance modeling is pivotal for enhancing user experience and safeguarding user satisfaction in e-commerce search systems. Recently, benefiting from the vast inherent knowledge, Large Language Model (LLM) approach demonstrates strong performance and long-tail generalization ability compared with previous neural-based specialized relevance learning methods. Though promising, current LLM-based methods encounter the following inadequacies in practice: First, the massive parameters and computational demands make it difficult to be deployed online. Second, distilling LLM models to online models is a feasible direction, but the LLM relevance modeling is a black box, and its rich intrinsic knowledge is difficult to extract and apply online. To improve the interpretability of LLM and boost the performance of online relevance models via LLM, we propose an Explainable LLM-driven Multi-dimensional Distillation framework for e-commerce relevance learning, which comprises two core components: (1) An Explainable LLM for relevance modeling (ELLM-rele), which decomposes the relevance learning into intermediate steps and models relevance learning as a Chain-of-Thought (CoT) reasoning, thereby enhancing both interpretability and performance of LLM. (2) A Multi-dimensional Knowledge Distillation (MKD) architecture that transfers the knowledge of ELLM-rele to current deployable interaction-based and representation-based student models from both the relevance score distribution and CoT reasoning aspects. Through distilling the probabilistic and CoT reasoning knowledge, MKD improves both the semantic interaction and long-tail generalization abilities of student models. Extensive offline evaluations and online experiments on Taobao search ad scene demonstrate that our proposed framework significantly enhances e-commerce relevance learning performance and user experience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13045v1-abstract-full').style.display = 'none'; document.getElementById('2411.13045v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to WWW 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12503">arXiv:2411.12503</a> <span> [<a href="https://arxiv.org/pdf/2411.12503">pdf</a>, <a href="https://arxiv.org/format/2411.12503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ManiSkill-ViTac 2025: Challenge on Manipulation Skill Learning With Vision and Tactile Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuanyu Li</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+R">Renjun Dang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhiyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&query=Kasaei%2C+H">Hamidreza Kasaei</a>, <a href="/search/cs?searchtype=author&query=Calandra%2C+R">Roberto Calandra</a>, <a href="/search/cs?searchtype=author&query=Lepora%2C+N">Nathan Lepora</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shan Luo</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12503v1-abstract-short" style="display: inline;"> This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on learning contact-rich manipulation skills using both tactile and visual sensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3 independent tracks: tactile manipulation, tactile-vision fusion manipulation, and tactile sensor structure design. The challenge aims to push the boundaries of robotic manipul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12503v1-abstract-full" style="display: none;"> This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on learning contact-rich manipulation skills using both tactile and visual sensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3 independent tracks: tactile manipulation, tactile-vision fusion manipulation, and tactile sensor structure design. The challenge aims to push the boundaries of robotic manipulation skills, emphasizing the integration of tactile and visual data to enhance performance in complex, real-world tasks. Participants will be evaluated using standardized metrics across both simulated and real-world environments, spurring innovations in sensor design and significantly advancing the field of vision-tactile fusion in robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12503v1-abstract-full').style.display = 'none'; document.getElementById('2411.12503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Challenge webpage: https://ai-workshops.github.io/maniskill-vitac-challenge-2025/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11871">arXiv:2411.11871</a> <span> [<a href="https://arxiv.org/pdf/2411.11871">pdf</a>, <a href="https://arxiv.org/format/2411.11871">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> MultiBalance: Multi-Objective Gradient Balancing in Industrial-Scale Multi-Task Recommendation System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yun He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuxing Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiayi Xu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Renqin Cai</a>, <a href="/search/cs?searchtype=author&query=You%2C+Y">Yiling You</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jennifer Cao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Minhui Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liu Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+R">Rong Jin</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sem Park</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xue Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11871v1-abstract-short" style="display: inline;"> In industrial recommendation systems, multi-task learning (learning multiple tasks simultaneously on a single model) is a predominant approach to save training/serving resources and improve recommendation performance via knowledge transfer between the joint learning tasks. However, multi-task learning often suffers from negative transfer: one or several tasks are less optimized than training them… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11871v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11871v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11871v1-abstract-full" style="display: none;"> In industrial recommendation systems, multi-task learning (learning multiple tasks simultaneously on a single model) is a predominant approach to save training/serving resources and improve recommendation performance via knowledge transfer between the joint learning tasks. However, multi-task learning often suffers from negative transfer: one or several tasks are less optimized than training them separately. To carefully balance the optimization, we propose a gradient balancing approach called MultiBalance, which is suitable for industrial-scale multi-task recommendation systems. It balances the per-task gradients to alleviate the negative transfer, while saving the huge cost for grid search or manual explorations for appropriate task weights. Moreover, compared with prior work that normally balance the per-task gradients of shared parameters, MultiBalance is more efficient since only requiring to access per-task gradients with respect to the shared feature representations. We conduct experiments on Meta's large-scale ads and feeds multi-task recommendation system, and observe that MultiBalance achieves significant gains (e.g., 0.738% improvement for normalized entropy (NE)) with neutral training cost in Queries Per Second (QPS), which is significantly more efficient than prior methods that balance per-task gradients of shared parameters with 70~80% QPS degradation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11871v1-abstract-full').style.display = 'none'; document.getElementById('2411.11871v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11197">arXiv:2411.11197</a> <span> [<a href="https://arxiv.org/pdf/2411.11197">pdf</a>, <a href="https://arxiv.org/format/2411.11197">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Stealing Training Graphs from Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+M">Minhua Lin</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+E">Enyan Dai</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Junjie Xu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jinyuan Jia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11197v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have shown promising results in modeling graphs in various tasks. The training of GNNs, especially on specialized tasks such as bioinformatics, demands extensive expert annotations, which are expensive and usually contain sensitive information of data providers. The trained GNN models are often shared for deployment in the real world. As neural networks can memorize th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11197v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11197v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11197v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have shown promising results in modeling graphs in various tasks. The training of GNNs, especially on specialized tasks such as bioinformatics, demands extensive expert annotations, which are expensive and usually contain sensitive information of data providers. The trained GNN models are often shared for deployment in the real world. As neural networks can memorize the training samples, the model parameters of GNNs have a high risk of leaking private training data. Our theoretical analysis shows the strong connections between trained GNN parameters and the training graphs used, confirming the training graph leakage issue. However, explorations into training data leakage from trained GNNs are rather limited. Therefore, we investigate a novel problem of stealing graphs from trained GNNs. To obtain high-quality graphs that resemble the target training set, a graph diffusion model with diffusion noise optimization is deployed as a graph generator. Furthermore, we propose a selection method that effectively leverages GNN model parameters to identify training graphs from samples generated by the graph diffusion model. Extensive experiments on real-world datasets demonstrate the effectiveness of the proposed framework in stealing training graphs from the trained GNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11197v1-abstract-full').style.display = 'none'; document.getElementById('2411.11197v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be appeared in KDD 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10249">arXiv:2411.10249</a> <span> [<a href="https://arxiv.org/pdf/2411.10249">pdf</a>, <a href="https://arxiv.org/format/2411.10249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> </div> <p class="title is-5 mathjax"> How the interplay between power concentration, competition, and propagation affects the resource efficiency of distributed ledgers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Barucca%2C+P">Paolo Barucca</a>, <a href="/search/cs?searchtype=author&query=Campajola%2C+C">Carlo Campajola</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiahua Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10249v1-abstract-short" style="display: inline;"> Forks in the Bitcoin network result from the natural competition in the blockchain's Proof-of-Work consensus protocol. Their frequency is a critical indicator for the efficiency of a distributed ledger as they can contribute to resource waste and network insecurity. We introduce a model for the estimation of natural fork rates in a network of heterogeneous miners as a function of their number, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10249v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10249v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10249v1-abstract-full" style="display: none;"> Forks in the Bitcoin network result from the natural competition in the blockchain's Proof-of-Work consensus protocol. Their frequency is a critical indicator for the efficiency of a distributed ledger as they can contribute to resource waste and network insecurity. We introduce a model for the estimation of natural fork rates in a network of heterogeneous miners as a function of their number, the distribution of hash rates and the block propagation time over the peer-to-peer infrastructure. Despite relatively simplistic assumptions, such as zero propagation delay within mining pools, the model predicts fork rates which are comparable with the empirical stale blocks rate. In the past decade, we observe a reduction in the number of mining pools approximately by a factor 3, and quantify its consequences for the fork rate, whilst showing the emergence of a truncated power-law distribution in hash rates, justified by a rich-get-richer effect constrained by global energy supply limits. We demonstrate, both empirically and with the aid of our quantitative model, that the ratio between the block propagation time and the mining time is a sufficiently accurate estimator of the fork rate, but also quantify its dependence on the heterogeneity of miner activities. We provide empirical and theoretical evidence that both hash rate concentration and lower block propagation time reduce fork rates in distributed ledgers. Our work introduces a robust mathematical setting for investigating power concentration and competition on a distributed network, for interpreting discrepancies in fork rates -- for example caused by selfish mining practices and asymmetric propagation times -- thus providing an effective tool for designing future and alternative scenarios for existing and new blockchain distributed mining systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10249v1-abstract-full').style.display = 'none'; document.getElementById('2411.10249v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09852">arXiv:2411.09852</a> <span> [<a href="https://arxiv.org/pdf/2411.09852">pdf</a>, <a href="https://arxiv.org/format/2411.09852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InterFormer: Towards Effective Heterogeneous Interaction Learning for Click-Through Rate Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhichen Zeng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaolong Liu</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+M">Mengyue Hang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qinghai Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chaofei Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+Y">Yichen Ruan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Laming Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yujia Hao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiaqi Xu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jade Nie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Buyun Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+W">Wei Wen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Siyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen-Yen Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yiping Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huayu Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chunzhi Yang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09852v1-abstract-short" style="display: inline;"> Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. How… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09852v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09852v1-abstract-full" style="display: none;"> Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. However, most of the existing methods suffer from two fundamental limitations, including (1) insufficient inter-mode interaction due to the unidirectional information flow between modes, and (2) aggressive information aggregation caused by early summarization, resulting in excessive information loss. To address the above limitations, we propose a novel module named InterFormer to learn heterogeneous information interaction in an interleaving style. To achieve better interaction learning, InterFormer enables bidirectional information flow for mutually beneficial learning across different modes. To avoid aggressive information aggregation, we retain complete information in each data mode and use a separate bridging arch for effective information selection and summarization. Our proposed InterFormer achieves state-of-the-art performance on three public datasets and a large-scale industrial dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09852v1-abstract-full').style.display = 'none'; document.getElementById('2411.09852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08999">arXiv:2411.08999</a> <span> [<a href="https://arxiv.org/pdf/2411.08999">pdf</a>, <a href="https://arxiv.org/format/2411.08999">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Learning-Based Control Barrier Function with Provably Safe Guarantees: Reducing Conservatism with Heading-Aware Safety Margin </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jianye Xu</a>, <a href="/search/cs?searchtype=author&query=Alrifaee%2C+B">Bassam Alrifaee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08999v1-abstract-short" style="display: inline;"> We propose a learning-based Control Barrier Function (CBF) to reduce conservatism in collision avoidance of car-like robots. Traditional CBFs often use Euclidean distance between robots' centers as safety margin, neglecting headings and simplifying geometries to circles. While this ensures smooth, differentiable safety functions required by CBFs, it can be overly conservative in tight environments… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08999v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08999v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08999v1-abstract-full" style="display: none;"> We propose a learning-based Control Barrier Function (CBF) to reduce conservatism in collision avoidance of car-like robots. Traditional CBFs often use Euclidean distance between robots' centers as safety margin, neglecting headings and simplifying geometries to circles. While this ensures smooth, differentiable safety functions required by CBFs, it can be overly conservative in tight environments. To address this limitation, we design a heading-aware safety margin that accounts for the robots' orientations, enabling a less conservative and more accurate estimation of safe regions. Since the function computing this safety margin is non-differentiable, we approximate it with a neural network to ensure differentiability and facilitate integration with CBFs. We describe how we achieve bounded learning error and incorporate the upper bound into the CBF to provide formal safety guarantees through forward invariance. We show that our CBF is a high-order CBF with relative degree two for a system with two robots whose dynamics are modeled by the nonlinear kinematic bicycle model. Experimental results in overtaking and bypassing scenarios reveal a 33.5 % reduction in conservatism compared to traditional methods, while maintaining safety. Code: https://github.com/bassamlab/sigmarl <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08999v1-abstract-full').style.display = 'none'; document.getElementById('2411.08999v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08320">arXiv:2411.08320</a> <span> [<a href="https://arxiv.org/pdf/2411.08320">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Responsible AI in Construction Safety: Systematic Evaluation of Large Language Models and Prompt Engineering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sammour%2C+F">Farouq Sammour</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jia Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Mo Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08320v1-abstract-short" style="display: inline;"> Construction remains one of the most hazardous sectors. Recent advancements in AI, particularly Large Language Models (LLMs), offer promising opportunities for enhancing workplace safety. However, responsible integration of LLMs requires systematic evaluation, as deploying them without understanding their capabilities and limitations risks generating inaccurate information, fostering misplaced con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08320v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08320v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08320v1-abstract-full" style="display: none;"> Construction remains one of the most hazardous sectors. Recent advancements in AI, particularly Large Language Models (LLMs), offer promising opportunities for enhancing workplace safety. However, responsible integration of LLMs requires systematic evaluation, as deploying them without understanding their capabilities and limitations risks generating inaccurate information, fostering misplaced confidence, and compromising worker safety. This study evaluates the performance of two widely used LLMs, GPT-3.5 and GPT-4o, across three standardized exams administered by the Board of Certified Safety Professionals (BCSP). Using 385 questions spanning seven safety knowledge areas, the study analyzes the models' accuracy, consistency, and reliability. Results show that both models consistently exceed the BCSP benchmark, with GPT-4o achieving an accuracy rate of 84.6% and GPT-3.5 reaching 73.8%. Both models demonstrate strengths in safety management systems and hazard identification and control, but exhibit weaknesses in science, mathematics, emergency response, and fire prevention. An error analysis identifies four primary limitations affecting LLM performance: lack of knowledge, reasoning flaws, memory issues, and calculation errors. Our study also highlights the impact of prompt engineering strategies, with variations in accuracy reaching 13.5% for GPT-3.5 and 7.9% for GPT-4o. However, no single prompt configuration proves universally effective. This research advances knowledge in three ways: by identifying areas where LLMs can support safety practices and where human oversight remains essential, by offering practical insights into improving LLM implementation through prompt engineering, and by providing evidence-based direction for future research and development. These contributions support the responsible integration of AI in construction safety management toward achieving zero injuries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08320v1-abstract-full').style.display = 'none'; document.getElementById('2411.08320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07387">arXiv:2411.07387</a> <span> [<a href="https://arxiv.org/pdf/2411.07387">pdf</a>, <a href="https://arxiv.org/format/2411.07387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Isochrony-Controlled Speech-to-Text Translation: A study on translating from Sino-Tibetan to Indo-European Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yousefi%2C+M">Midia Yousefi</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Y">Yao Qian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junkun Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dongmei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jian Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07387v1-abstract-short" style="display: inline;"> End-to-end speech translation (ST), which translates source language speech directly into target language text, has garnered significant attention in recent years. Many ST applications require strict length control to ensure that the translation duration matches the length of the source audio, including both speech and pause segments. Previous methods often controlled the number of words or charac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07387v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07387v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07387v1-abstract-full" style="display: none;"> End-to-end speech translation (ST), which translates source language speech directly into target language text, has garnered significant attention in recent years. Many ST applications require strict length control to ensure that the translation duration matches the length of the source audio, including both speech and pause segments. Previous methods often controlled the number of words or characters generated by the Machine Translation model to approximate the source sentence's length without considering the isochrony of pauses and speech segments, as duration can vary between languages. To address this, we present improvements to the duration alignment component of our sequence-to-sequence ST model. Our method controls translation length by predicting the duration of speech and pauses in conjunction with the translation process. This is achieved by providing timing information to the decoder, ensuring it tracks the remaining duration for speech and pauses while generating the translation. The evaluation on the Zh-En test set of CoVoST 2, demonstrates that the proposed Isochrony-Controlled ST achieves 0.92 speech overlap and 8.9 BLEU, which has only a 1.4 BLEU drop compared to the ST baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07387v1-abstract-full').style.display = 'none'; document.getElementById('2411.07387v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07135">arXiv:2411.07135</a> <span> [<a href="https://arxiv.org/pdf/2411.07135">pdf</a>, <a href="https://arxiv.org/format/2411.07135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Edify 3D: Scalable High-Quality 3D Asset Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=NVIDIA"> NVIDIA</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Bala%2C+M">Maciej Bala</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yin Cui</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yifan Ding</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yunhao Ge</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Z">Zekun Hao</a>, <a href="/search/cs?searchtype=author&query=Hasselgren%2C+J">Jon Hasselgren</a>, <a href="/search/cs?searchtype=author&query=Huffman%2C+J">Jacob Huffman</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jingyi Jin</a>, <a href="/search/cs?searchtype=author&query=Lewis%2C+J+P">J. P. Lewis</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoshuo Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chen-Hsuan Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yen-Chen Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tsung-Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Ming-Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+A">Alice Luo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qianli Ma</a>, <a href="/search/cs?searchtype=author&query=Munkberg%2C+J">Jacob Munkberg</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Stella Shi</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+F">Fangyin Wei</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+D">Donglai Xiang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiashu Xu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xiaohui Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qinsheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07135v1-abstract-short" style="display: inline;"> We introduce Edify 3D, an advanced solution designed for high-quality 3D asset generation. Our method first synthesizes RGB and surface normal images of the described object at multiple viewpoints using a diffusion model. The multi-view observations are then used to reconstruct the shape, texture, and PBR materials of the object. Our method can generate high-quality 3D assets with detailed geometr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07135v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07135v1-abstract-full" style="display: none;"> We introduce Edify 3D, an advanced solution designed for high-quality 3D asset generation. Our method first synthesizes RGB and surface normal images of the described object at multiple viewpoints using a diffusion model. The multi-view observations are then used to reconstruct the shape, texture, and PBR materials of the object. Our method can generate high-quality 3D assets with detailed geometry, clean shape topologies, high-resolution textures, and materials within 2 minutes of runtime. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07135v1-abstract-full').style.display = 'none'; document.getElementById('2411.07135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://research.nvidia.com/labs/dir/edify-3d</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06895">arXiv:2411.06895</a> <span> [<a href="https://arxiv.org/pdf/2411.06895">pdf</a>, <a href="https://arxiv.org/format/2411.06895">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> DynaShard: Secure and Adaptive Blockchain Sharding Protocol with Hybrid Consensus and Dynamic Shard Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+A">Ao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+K">Kun He</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruiying Du</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiahua Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cong Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yebo Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Teng Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianfeng Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06895v1-abstract-short" style="display: inline;"> Blockchain sharding has emerged as a promising solution to the scalability challenges in traditional blockchain systems by partitioning the network into smaller, manageable subsets called shards. Despite its potential, existing sharding solutions face significant limitations in handling dynamic workloads, ensuring secure cross-shard transactions, and maintaining system integrity. To address these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06895v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06895v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06895v1-abstract-full" style="display: none;"> Blockchain sharding has emerged as a promising solution to the scalability challenges in traditional blockchain systems by partitioning the network into smaller, manageable subsets called shards. Despite its potential, existing sharding solutions face significant limitations in handling dynamic workloads, ensuring secure cross-shard transactions, and maintaining system integrity. To address these gaps, we propose DynaShard, a dynamic and secure cross-shard transaction processing mechanism designed to enhance blockchain sharding efficiency and security. DynaShard combines adaptive shard management, a hybrid consensus approach, plus an efficient state synchronization and dispute resolution protocol. Our performance evaluation, conducted using a robust experimental setup with real-world network conditions and transaction workloads, demonstrates DynaShard's superior throughput, reduced latency, and improved shard utilization compared to the FTBS method. Specifically, DynaShard achieves up to a 42.6% reduction in latency and a 78.77% improvement in shard utilization under high transaction volumes and varying cross-shard transaction ratios. These results highlight DynaShard's ability to outperform state-of-the-art sharding methods, ensuring scalable and resilient blockchain systems. We believe that DynaShard's innovative approach will significantly impact future developments in blockchain technology, paving the way for more efficient and secure distributed systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06895v1-abstract-full').style.display = 'none'; document.getElementById('2411.06895v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06121">arXiv:2411.06121</a> <span> [<a href="https://arxiv.org/pdf/2411.06121">pdf</a>, <a href="https://arxiv.org/format/2411.06121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> SniffySquad: Patchiness-Aware Gas Source Localization with Multi-Robot Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yuhan Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuecheng Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yixuan Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingao Xu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C">Chaopeng Hong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Susu Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao-Ping Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunhao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinlei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06121v1-abstract-short" style="display: inline;"> Gas source localization is pivotal for the rapid mitigation of gas leakage disasters, where mobile robots emerge as a promising solution. However, existing methods predominantly schedule robots' movements based on reactive stimuli or simplified gas plume models. These approaches typically excel in idealized, simulated environments but fall short in real-world gas environments characterized by thei… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06121v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06121v1-abstract-full" style="display: none;"> Gas source localization is pivotal for the rapid mitigation of gas leakage disasters, where mobile robots emerge as a promising solution. However, existing methods predominantly schedule robots' movements based on reactive stimuli or simplified gas plume models. These approaches typically excel in idealized, simulated environments but fall short in real-world gas environments characterized by their patchy distribution. In this work, we introduce SniffySquad, a multi-robot olfaction-based system designed to address the inherent patchiness in gas source localization. SniffySquad incorporates a patchiness-aware active sensing approach that enhances the quality of data collection and estimation. Moreover, it features an innovative collaborative role adaptation strategy to boost the efficiency of source-seeking endeavors. Extensive evaluations demonstrate that our system achieves an increase in the success rate by $20\%+$ and an improvement in path efficiency by $30\%+$, outperforming state-of-the-art gas source localization solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06121v1-abstract-full').style.display = 'none'; document.getElementById('2411.06121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05212">arXiv:2411.05212</a> <span> [<a href="https://arxiv.org/pdf/2411.05212">pdf</a>, <a href="https://arxiv.org/format/2411.05212">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RT-Grasp: Reasoning Tuning Robotic Grasping via Multi-modal Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jinxuan Xu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shiyu Jin</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Y">Yutian Lei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liangjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05212v1-abstract-short" style="display: inline;"> Recent advances in Large Language Models (LLMs) have showcased their remarkable reasoning capabilities, making them influential across various fields. However, in robotics, their use has primarily been limited to manipulation planning tasks due to their inherent textual output. This paper addresses this limitation by investigating the potential of adopting the reasoning ability of LLMs for generat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05212v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05212v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05212v1-abstract-full" style="display: none;"> Recent advances in Large Language Models (LLMs) have showcased their remarkable reasoning capabilities, making them influential across various fields. However, in robotics, their use has primarily been limited to manipulation planning tasks due to their inherent textual output. This paper addresses this limitation by investigating the potential of adopting the reasoning ability of LLMs for generating numerical predictions in robotics tasks, specifically for robotic grasping. We propose Reasoning Tuning, a novel method that integrates a reasoning phase before prediction during training, leveraging the extensive prior knowledge and advanced reasoning abilities of LLMs. This approach enables LLMs, notably with multi-modal capabilities, to generate accurate numerical outputs like grasp poses that are context-aware and adaptable through conversations. Additionally, we present the Reasoning Tuning VLM Grasp dataset, carefully curated to facilitate the adaptation of LLMs to robotic grasping. Extensive validation on both grasping datasets and real-world experiments underscores the adaptability of multi-modal LLMs for numerical prediction tasks in robotics. This not only expands their applicability but also bridges the gap between text-based planning and direct robot control, thereby maximizing the potential of LLMs in robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05212v1-abstract-full').style.display = 'none'; document.getElementById('2411.05212v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IROS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05155">arXiv:2411.05155</a> <span> [<a href="https://arxiv.org/pdf/2411.05155">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> DynaPain: Moving Flame Beetle with Dynamic Pain Illusion Adapting Apparent Movement to Thermal Grill Illusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mizuno%2C+S">Souta Mizuno</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiayi Xu</a>, <a href="/search/cs?searchtype=author&query=Hasegawa%2C+S">Shoichi Hasegawa</a>, <a href="/search/cs?searchtype=author&query=Ienaga%2C+N">Naoto Ienaga</a>, <a href="/search/cs?searchtype=author&query=Kuroda%2C+Y">Yoshihiro Kuroda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05155v1-abstract-short" style="display: inline;"> Pain sensation presentation with movable sensory position is important to imitate the pain caused by objects in motion and the pain corresponding to a person's movements. We aimed at proposing a novel dynamic pain sensation experience, called DynaPain. DynaPain was achieved by the non-contact thermal grill illusion and the apparent movement. The demonstration provided the dynamic heat and pain exp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05155v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05155v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05155v1-abstract-full" style="display: none;"> Pain sensation presentation with movable sensory position is important to imitate the pain caused by objects in motion and the pain corresponding to a person's movements. We aimed at proposing a novel dynamic pain sensation experience, called DynaPain. DynaPain was achieved by the non-contact thermal grill illusion and the apparent movement. The demonstration provided the dynamic heat and pain experience through interaction with a flame beetle moving on the arm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05155v1-abstract-full').style.display = 'none'; document.getElementById('2411.05155v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of proceedings of 6th International Conference AsiaHaptics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05103">arXiv:2411.05103</a> <span> [<a href="https://arxiv.org/pdf/2411.05103">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> MoHeat: A Modular Platform for High-Responsive Non-Contact Thermal Feedback Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiayi Xu</a>, <a href="/search/cs?searchtype=author&query=Nakamura%2C+K">Kazuma Nakamura</a>, <a href="/search/cs?searchtype=author&query=Kuroda%2C+Y">Yoshihiro Kuroda</a>, <a href="/search/cs?searchtype=author&query=Inami%2C+M">Masahiko Inami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05103v1-abstract-short" style="display: inline;"> MoHeat is a modular hardware and software platform designed for rapid prototyping of highly responsive, non-contact thermal feedback interactions. In our previous work, we developed an intensity-adjustable, highly responsive, non-contact thermal feedback system by integrating the vortex effect and thermal radiation. In this study, we further enhanced the system by developing an authoring tool that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05103v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05103v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05103v1-abstract-full" style="display: none;"> MoHeat is a modular hardware and software platform designed for rapid prototyping of highly responsive, non-contact thermal feedback interactions. In our previous work, we developed an intensity-adjustable, highly responsive, non-contact thermal feedback system by integrating the vortex effect and thermal radiation. In this study, we further enhanced the system by developing an authoring tool that allows users to freely adjust the intensity of thermal stimuli, the duration of stimuli, the delay time before stimuli, and the interval between alternating hot and cold stimuli. This modular approach enables countless combinations of non-contact thermal feedback experiences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05103v1-abstract-full').style.display = 'none'; document.getElementById('2411.05103v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of proceedings of 6th International Conference AsiaHaptics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05026">arXiv:2411.05026</a> <span> [<a href="https://arxiv.org/pdf/2411.05026">pdf</a>, <a href="https://arxiv.org/ps/2411.05026">ps</a>, <a href="https://arxiv.org/format/2411.05026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning and Machine Learning -- Natural Language Processing: From Theory to Application </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Keyu Chen</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+C">Cheng Fei</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Z">Ziqian Bi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junyu Liu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Benji Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sen Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xuanhe Pan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiawei Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinlang Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+C+H">Caitlyn Heqi Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichao Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+P">Pohsun Feng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yizhu Wen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianyang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jintao Ren</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Q">Qian Niu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Silin Chen</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+W">Weiche Hsieh</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L+K+Q">Lawrence K. Q. Yan</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C+X">Chia Xin Liang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Han Xu</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+H">Hong-Ming Tseng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xinyuan Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Ming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05026v1-abstract-short" style="display: inline;"> With a focus on natural language processing (NLP) and the role of large language models (LLMs), we explore the intersection of machine learning, deep learning, and artificial intelligence. As artificial intelligence continues to revolutionize fields from healthcare to finance, NLP techniques such as tokenization, text classification, and entity recognition are essential for processing and understa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05026v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05026v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05026v1-abstract-full" style="display: none;"> With a focus on natural language processing (NLP) and the role of large language models (LLMs), we explore the intersection of machine learning, deep learning, and artificial intelligence. As artificial intelligence continues to revolutionize fields from healthcare to finance, NLP techniques such as tokenization, text classification, and entity recognition are essential for processing and understanding human language. This paper discusses advanced data preprocessing techniques and the use of frameworks like Hugging Face for implementing transformer-based models. Additionally, it highlights challenges such as handling multilingual data, reducing bias, and ensuring model robustness. By addressing key aspects of data processing and model fine-tuning, this work aims to provide insights into deploying effective and ethically sound AI solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05026v1-abstract-full').style.display = 'none'; document.getElementById('2411.05026v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">255 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04954">arXiv:2411.04954</a> <span> [<a href="https://arxiv.org/pdf/2411.04954">pdf</a>, <a href="https://arxiv.org/format/2411.04954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CAD-MLLM: Unifying Multimodality-Conditioned CAD Generation With MLLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wen Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yi Ma</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shenghua Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04954v1-abstract-short" style="display: inline;"> This paper aims to design a unified Computer-Aided Design (CAD) generation system that can easily generate CAD models based on the user's inputs in the form of textual description, images, point clouds, or even a combination of them. Towards this goal, we introduce the CAD-MLLM, the first system capable of generating parametric CAD models conditioned on the multimodal input. Specifically, within t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04954v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04954v1-abstract-full" style="display: none;"> This paper aims to design a unified Computer-Aided Design (CAD) generation system that can easily generate CAD models based on the user's inputs in the form of textual description, images, point clouds, or even a combination of them. Towards this goal, we introduce the CAD-MLLM, the first system capable of generating parametric CAD models conditioned on the multimodal input. Specifically, within the CAD-MLLM framework, we leverage the command sequences of CAD models and then employ advanced large language models (LLMs) to align the feature space across these diverse multi-modalities data and CAD models' vectorized representations. To facilitate the model training, we design a comprehensive data construction and annotation pipeline that equips each CAD model with corresponding multimodal data. Our resulting dataset, named Omni-CAD, is the first multimodal CAD dataset that contains textual description, multi-view images, points, and command sequence for each CAD model. It contains approximately 450K instances and their CAD construction sequences. To thoroughly evaluate the quality of our generated CAD models, we go beyond current evaluation metrics that focus on reconstruction quality by introducing additional metrics that assess topology quality and surface enclosure extent. Extensive experimental results demonstrate that CAD-MLLM significantly outperforms existing conditional generative methods and remains highly robust to noises and missing points. The project page and more visualizations can be found at: https://cad-mllm.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04954v1-abstract-full').style.display = 'none'; document.getElementById('2411.04954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://cad-mllm.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04422">arXiv:2411.04422</a> <span> [<a href="https://arxiv.org/pdf/2411.04422">pdf</a>, <a href="https://arxiv.org/format/2411.04422">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Abnormal Stop Detection for Long Distance Coaches with Low-Frequency GPS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiaxin Deng</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+J">Junbiao Pang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiayu Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haitao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04422v1-abstract-short" style="display: inline;"> In our urban life, long distance coaches supply a convenient yet economic approach to the transportation of the public. One notable problem is to discover the abnormal stop of the coaches due to the important reason, i.e., illegal pick up on the way which possibly endangers the safety of passengers. It has become a pressing issue to detect the coach abnormal stop with low-quality GPS. In this pape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04422v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04422v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04422v1-abstract-full" style="display: none;"> In our urban life, long distance coaches supply a convenient yet economic approach to the transportation of the public. One notable problem is to discover the abnormal stop of the coaches due to the important reason, i.e., illegal pick up on the way which possibly endangers the safety of passengers. It has become a pressing issue to detect the coach abnormal stop with low-quality GPS. In this paper, we propose an unsupervised method that helps transportation managers to efficiently discover the Abnormal Stop Detection (ASD) for long distance coaches. Concretely, our method converts the ASD problem into an unsupervised clustering framework in which both the normal stop and the abnormal one are decomposed. Firstly, we propose a stop duration model for the low frequency GPS based on the assumption that a coach changes speed approximately in a linear approach. Secondly, we strip the abnormal stops from the normal stop points by the low rank assumption. The proposed method is conceptually simple yet efficient, by leveraging low rank assumption to handle normal stop points, our approach enables domain experts to discover the ASD for coaches, from a case study motivated by traffic managers. Datset and code are publicly available at: https://github.com/pangjunbiao/IPPs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04422v1-abstract-full').style.display = 'none'; document.getElementById('2411.04422v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04109">arXiv:2411.04109</a> <span> [<a href="https://arxiv.org/pdf/2411.04109">pdf</a>, <a href="https://arxiv.org/format/2411.04109">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Self-Consistency Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Prasad%2C+A">Archiki Prasad</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Weizhe Yuan</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+R+Y">Richard Yuanzhe Pang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&query=Fazel-Zarandi%2C+M">Maryam Fazel-Zarandi</a>, <a href="/search/cs?searchtype=author&query=Bansal%2C+M">Mohit Bansal</a>, <a href="/search/cs?searchtype=author&query=Sukhbaatar%2C+S">Sainbayar Sukhbaatar</a>, <a href="/search/cs?searchtype=author&query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jane Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04109v2-abstract-short" style="display: inline;"> Self-alignment, whereby models learn to improve themselves without human annotation, is a rapidly growing research area. However, existing techniques often fail to improve complex reasoning tasks due to the difficulty of assigning correct rewards. An orthogonal approach that is known to improve correctness is self-consistency, a method applied at inference time based on multiple sampling in order… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04109v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04109v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04109v2-abstract-full" style="display: none;"> Self-alignment, whereby models learn to improve themselves without human annotation, is a rapidly growing research area. However, existing techniques often fail to improve complex reasoning tasks due to the difficulty of assigning correct rewards. An orthogonal approach that is known to improve correctness is self-consistency, a method applied at inference time based on multiple sampling in order to find the most consistent answer. In this work, we extend the self-consistency concept to help train models. We thus introduce self-consistency preference optimization (ScPO), which iteratively trains consistent answers to be preferred over inconsistent ones on unsupervised new problems. We show ScPO leads to large improvements over conventional reward model training on reasoning tasks such as GSM8K and MATH, closing the gap with supervised training with gold answers or preferences, and that combining ScPO with standard supervised learning improves results even further. On ZebraLogic, ScPO finetunes Llama-3 8B to be superior to Llama-3 70B, Gemma-2 27B, and Claude-3 Haiku. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04109v2-abstract-full').style.display = 'none'; document.getElementById('2411.04109v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03663">arXiv:2411.03663</a> <span> [<a href="https://arxiv.org/pdf/2411.03663">pdf</a>, <a href="https://arxiv.org/format/2411.03663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Can Graph Neural Networks Expose Training Data Properties? An Efficient Risk Assessment Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hanyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiarong Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Renhong Huang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Mingli Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunping Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03663v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) have attracted considerable attention due to their diverse applications. However, the scarcity and quality limitations of graph data present challenges to their training process in practical settings. To facilitate the development of effective GNNs, companies and researchers often seek external collaboration. Yet, directly sharing data raises privacy concerns, motivati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03663v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03663v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) have attracted considerable attention due to their diverse applications. However, the scarcity and quality limitations of graph data present challenges to their training process in practical settings. To facilitate the development of effective GNNs, companies and researchers often seek external collaboration. Yet, directly sharing data raises privacy concerns, motivating data owners to train GNNs on their private graphs and share the trained models. Unfortunately, these models may still inadvertently disclose sensitive properties of their training graphs (e.g., average default rate in a transaction network), leading to severe consequences for data owners. In this work, we study graph property inference attack to identify the risk of sensitive property information leakage from shared models. Existing approaches typically train numerous shadow models for developing such attack, which is computationally intensive and impractical. To address this issue, we propose an efficient graph property inference attack by leveraging model approximation techniques. Our method only requires training a small set of models on graphs, while generating a sufficient number of approximated shadow models for attacks. To enhance diversity while reducing errors in the approximated models, we apply edit distance to quantify the diversity within a group of approximated models and introduce a theoretically guaranteed criterion to evaluate each model's error. Subsequently, we propose a novel selection mechanism to ensure that the retained approximated models achieve high diversity and low error. Extensive experiments across six real-world scenarios demonstrate our method's substantial improvement, with average increases of 2.7% in attack accuracy and 4.1% in ROC-AUC, while being 6.5$\times$ faster compared to the best baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03663v1-abstract-full').style.display = 'none'; document.getElementById('2411.03663v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In NeurIPS'24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03638">arXiv:2411.03638</a> <span> [<a href="https://arxiv.org/pdf/2411.03638">pdf</a>, <a href="https://arxiv.org/format/2411.03638">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Stereo Depth Estimation with Multi-Spectral Images Across All Lighting Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zihan Qin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jialei Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wenbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Junjun Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xianming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03638v1-abstract-short" style="display: inline;"> Depth estimation under adverse conditions remains a significant challenge. Recently, multi-spectral depth estimation, which integrates both visible light and thermal images, has shown promise in addressing this issue. However, existing algorithms struggle with precise pixel-level feature matching, limiting their ability to fully exploit geometric constraints across different spectra. To address th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03638v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03638v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03638v1-abstract-full" style="display: none;"> Depth estimation under adverse conditions remains a significant challenge. Recently, multi-spectral depth estimation, which integrates both visible light and thermal images, has shown promise in addressing this issue. However, existing algorithms struggle with precise pixel-level feature matching, limiting their ability to fully exploit geometric constraints across different spectra. To address this, we propose a novel framework incorporating stereo depth estimation to enforce accurate geometric constraints. In particular, we treat the visible light and thermal images as a stereo pair and utilize a Cross-modal Feature Matching (CFM) Module to construct a cost volume for pixel-level matching. To mitigate the effects of poor lighting on stereo matching, we introduce Degradation Masking, which leverages robust monocular thermal depth estimation in degraded regions. Our method achieves state-of-the-art (SOTA) performance on the Multi-Spectral Stereo (MS2) dataset, with qualitative evaluations demonstrating high-quality depth maps under varying lighting conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03638v1-abstract-full').style.display = 'none'; document.getElementById('2411.03638v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03350">arXiv:2411.03350</a> <span> [<a href="https://arxiv.org/pdf/2411.03350">pdf</a>, <a href="https://arxiv.org/format/2411.03350">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey of Small Language Models in the Era of Large Language Models: Techniques, Enhancements, Applications, Collaboration with LLMs, and Trustworthiness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fali Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xianren Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongyu Wu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tzuhao Mo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qiuhao Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wanjing Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Junjie Xu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xianfeng Tang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qi He</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yao Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Ming Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03350v1-abstract-short" style="display: inline;"> Large language models (LLM) have demonstrated emergent abilities in text generation, question answering, and reasoning, facilitating various tasks and domains. Despite their proficiency in various tasks, LLMs like LaPM 540B and Llama-3.1 405B face limitations due to large parameter sizes and computational demands, often requiring cloud API use which raises privacy concerns, limits real-time applic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03350v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03350v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03350v1-abstract-full" style="display: none;"> Large language models (LLM) have demonstrated emergent abilities in text generation, question answering, and reasoning, facilitating various tasks and domains. Despite their proficiency in various tasks, LLMs like LaPM 540B and Llama-3.1 405B face limitations due to large parameter sizes and computational demands, often requiring cloud API use which raises privacy concerns, limits real-time applications on edge devices, and increases fine-tuning costs. Additionally, LLMs often underperform in specialized domains such as healthcare and law due to insufficient domain-specific knowledge, necessitating specialized models. Therefore, Small Language Models (SLMs) are increasingly favored for their low inference latency, cost-effectiveness, efficient development, and easy customization and adaptability. These models are particularly well-suited for resource-limited environments and domain knowledge acquisition, addressing LLMs' challenges and proving ideal for applications that require localized data handling for privacy, minimal inference latency for efficiency, and domain knowledge acquisition through lightweight fine-tuning. The rising demand for SLMs has spurred extensive research and development. However, a comprehensive survey investigating issues related to the definition, acquisition, application, enhancement, and reliability of SLM remains lacking, prompting us to conduct a detailed survey on these topics. The definition of SLMs varies widely, thus to standardize, we propose defining SLMs by their capability to perform specialized tasks and suitability for resource-constrained settings, setting boundaries based on the minimal size for emergent abilities and the maximum size sustainable under resource constraints. For other aspects, we provide a taxonomy of relevant models/methods and develop general frameworks for each category to enhance and utilize SLMs effectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03350v1-abstract-full').style.display = 'none'; document.getElementById('2411.03350v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">76 pages, 26 figures, 14 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 (Primary) 68T07 (Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03255">arXiv:2411.03255</a> <span> [<a href="https://arxiv.org/pdf/2411.03255">pdf</a>, <a href="https://arxiv.org/format/2411.03255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Error Interference in Quantum Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Boyang Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jue Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiao Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03255v2-abstract-short" style="display: inline;"> Understanding algorithmic error accumulation in quantum simulation is crucial due to its fundamental significance and practical applications in simulating quantum many-body system dynamics. Conventional theories typically apply the triangle inequality to provide an upper bound for the error. However, these often yield overly conservative and inaccurate estimates as they neglect error interference… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03255v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03255v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03255v2-abstract-full" style="display: none;"> Understanding algorithmic error accumulation in quantum simulation is crucial due to its fundamental significance and practical applications in simulating quantum many-body system dynamics. Conventional theories typically apply the triangle inequality to provide an upper bound for the error. However, these often yield overly conservative and inaccurate estimates as they neglect error interference -- a phenomenon where errors in different segments can destructively interfere. Here, we introduce a novel method that directly estimates the long-time algorithmic errors with multiple segments, thereby establishing a comprehensive framework for characterizing algorithmic error interference. We identify the sufficient and necessary condition for strict error interference and introduce the concept of approximate error interference, which is more broadly applicable to scenarios such as power-law interaction models, the Fermi-Hubbard model, and higher-order Trotter formulas. Our work demonstrates significant improvements over prior ones and opens new avenues for error analysis in quantum simulation, offering potential advancements in both theoretical algorithm design and experimental implementation of Hamiltonian simulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03255v2-abstract-full').style.display = 'none'; document.getElementById('2411.03255v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02848">arXiv:2411.02848</a> <span> [<a href="https://arxiv.org/pdf/2411.02848">pdf</a>, <a href="https://arxiv.org/format/2411.02848">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1121/10.0026598">10.1121/10.0026598 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Adversarial multi-task underwater acoustic target recognition: towards robustness against various influential factors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuan Xie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Ji Xu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jiawei Ren</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junfeng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02848v1-abstract-short" style="display: inline;"> Underwater acoustic target recognition based on passive sonar faces numerous challenges in practical maritime applications. One of the main challenges lies in the susceptibility of signal characteristics to diverse environmental conditions and data acquisition configurations, which can lead to instability in recognition systems. While significant efforts have been dedicated to addressing these inf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02848v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02848v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02848v1-abstract-full" style="display: none;"> Underwater acoustic target recognition based on passive sonar faces numerous challenges in practical maritime applications. One of the main challenges lies in the susceptibility of signal characteristics to diverse environmental conditions and data acquisition configurations, which can lead to instability in recognition systems. While significant efforts have been dedicated to addressing these influential factors in other domains of underwater acoustics, they are often neglected in the field of underwater acoustic target recognition. To overcome this limitation, this study designs auxiliary tasks that model influential factors (e.g., source range, water column depth, or wind speed) based on available annotations and adopts a multi-task framework to connect these factors to the recognition task. Furthermore, we integrate an adversarial learning mechanism into the multi-task framework to prompt the model to extract representations that are robust against influential factors. Through extensive experiments and analyses on the ShipsEar dataset, our proposed adversarial multi-task model demonstrates its capacity to effectively model the influential factors and achieve state-of-the-art performance on the 12-class recognition task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02848v1-abstract-full').style.display = 'none'; document.getElementById('2411.02848v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02787">arXiv:2411.02787</a> <span> [<a href="https://arxiv.org/pdf/2411.02787">pdf</a>, <a href="https://arxiv.org/format/2411.02787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1121/10.0026481">10.1121/10.0026481 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Advancing Robust Underwater Acoustic Target Recognition through Multi-task Learning and Multi-Gate Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuan Xie</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jiawei Ren</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junfeng Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Ji Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02787v1-abstract-short" style="display: inline;"> Underwater acoustic target recognition has emerged as a prominent research area within the field of underwater acoustics. However, the current availability of authentic underwater acoustic signal recordings remains limited, which hinders data-driven acoustic recognition models from learning robust patterns of targets from a limited set of intricate underwater signals, thereby compromising their st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02787v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02787v1-abstract-full" style="display: none;"> Underwater acoustic target recognition has emerged as a prominent research area within the field of underwater acoustics. However, the current availability of authentic underwater acoustic signal recordings remains limited, which hinders data-driven acoustic recognition models from learning robust patterns of targets from a limited set of intricate underwater signals, thereby compromising their stability in practical applications. To overcome these limitations, this study proposes a recognition framework called M3 (Multi-task, Multi-gate, Multi-expert) to enhance the model's ability to capture robust patterns by making it aware of the inherent properties of targets. In this framework, an auxiliary task that focuses on target properties, such as estimating target size, is designed. The auxiliary task then shares parameters with the recognition task to realize multi-task learning. This paradigm allows the model to concentrate on shared information across tasks and identify robust patterns of targets in a regularized manner, thereby enhancing the model's generalization ability. Moreover, M3 incorporates multi-expert and multi-gate mechanisms, allowing for the allocation of distinct parameter spaces to various underwater signals. This enables the model to process intricate signal patterns in a fine-grained and differentiated manner. To evaluate the effectiveness of M3, extensive experiments were implemented on the ShipsEar underwater ship-radiated noise dataset. The results substantiate that M3 has the ability to outperform the most advanced single-task recognition models, thereby achieving the state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02787v1-abstract-full').style.display = 'none'; document.getElementById('2411.02787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02758">arXiv:2411.02758</a> <span> [<a href="https://arxiv.org/pdf/2411.02758">pdf</a>, <a href="https://arxiv.org/format/2411.02758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DEMONet: Underwater Acoustic Target Recognition based on Multi-Expert Network and Cross-Temporal Variational Autoencoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuan Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jiawei Ren</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Ji Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02758v1-abstract-short" style="display: inline;"> Building a robust underwater acoustic recognition system in real-world scenarios is challenging due to the complex underwater environment and the dynamic motion states of targets. A promising optimization approach is to leverage the intrinsic physical characteristics of targets, which remain invariable regardless of environmental conditions, to provide robust insights. However, our study reveals t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02758v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02758v1-abstract-full" style="display: none;"> Building a robust underwater acoustic recognition system in real-world scenarios is challenging due to the complex underwater environment and the dynamic motion states of targets. A promising optimization approach is to leverage the intrinsic physical characteristics of targets, which remain invariable regardless of environmental conditions, to provide robust insights. However, our study reveals that while physical characteristics exhibit robust properties, they may lack class-specific discriminative patterns. Consequently, directly incorporating physical characteristics into model training can potentially introduce unintended inductive biases, leading to performance degradation. To utilize the benefits of physical characteristics while mitigating possible detrimental effects, we propose DEMONet in this study, which utilizes the detection of envelope modulation on noise (DEMON) to provide robust insights into the shaft frequency or blade counts of targets. DEMONet is a multi-expert network that allocates various underwater signals to their best-matched expert layer based on DEMON spectra for fine-grained signal processing. Thereinto, DEMON spectra are solely responsible for providing implicit physical characteristics without establishing a mapping relationship with the target category. Furthermore, to mitigate noise and spurious modulation spectra in DEMON features, we introduce a cross-temporal alignment strategy and employ a variational autoencoder (VAE) to reconstruct noise-resistant DEMON spectra to replace the raw DEMON features. The effectiveness of the proposed DEMONet with cross-temporal VAE was primarily evaluated on the DeepShip dataset and our proprietary datasets. Experimental results demonstrated that our approach could achieve state-of-the-art performance on both datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02758v1-abstract-full').style.display = 'none'; document.getElementById('2411.02758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02395">arXiv:2411.02395</a> <span> [<a href="https://arxiv.org/pdf/2411.02395">pdf</a>, <a href="https://arxiv.org/format/2411.02395">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Training-free Regional Prompting for Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+A">Anthony Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jianjin Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+G">Gaole Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yida Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Renrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02395v1-abstract-short" style="display: inline;"> Diffusion models have demonstrated excellent capabilities in text-to-image generation. Their semantic understanding (i.e., prompt following) ability has also been greatly improved with large language models (e.g., T5, Llama). However, existing models cannot perfectly handle long and complex text prompts, especially when the text prompts contain various objects with numerous attributes and interrel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02395v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02395v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02395v1-abstract-full" style="display: none;"> Diffusion models have demonstrated excellent capabilities in text-to-image generation. Their semantic understanding (i.e., prompt following) ability has also been greatly improved with large language models (e.g., T5, Llama). However, existing models cannot perfectly handle long and complex text prompts, especially when the text prompts contain various objects with numerous attributes and interrelated spatial relationships. While many regional prompting methods have been proposed for UNet-based models (SD1.5, SDXL), but there are still no implementations based on the recent Diffusion Transformer (DiT) architecture, such as SD3 and FLUX.1.In this report, we propose and implement regional prompting for FLUX.1 based on attention manipulation, which enables DiT with fined-grained compositional text-to-image generation capability in a training-free manner. Code is available at https://github.com/antonioo-c/Regional-Prompting-FLUX. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02395v1-abstract-full').style.display = 'none'; document.getElementById('2411.02395v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/antonioo-c/Regional-Prompting-FLUX</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practice of Hunyuan-Large include large-scale synthetic data that is orders larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we also investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidances for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01780">arXiv:2411.01780</a> <span> [<a href="https://arxiv.org/pdf/2411.01780">pdf</a>, <a href="https://arxiv.org/format/2411.01780">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Clustering Based on Density Propagation and Subcluster Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+F">Feiping Nie</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yitao Song</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jingjing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01780v1-abstract-short" style="display: inline;"> We propose the DPSM method, a density-based node clustering approach that automatically determines the number of clusters and can be applied in both data space and graph space. Unlike traditional density-based clustering methods, which necessitate calculating the distance between any two nodes, our proposed technique determines density through a propagation process, thereby making it suitable for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01780v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01780v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01780v1-abstract-full" style="display: none;"> We propose the DPSM method, a density-based node clustering approach that automatically determines the number of clusters and can be applied in both data space and graph space. Unlike traditional density-based clustering methods, which necessitate calculating the distance between any two nodes, our proposed technique determines density through a propagation process, thereby making it suitable for a graph space. In DPSM, nodes are partitioned into small clusters based on propagated density. The partitioning technique has been proved to be sound and complete. We then extend the concept of spectral clustering from individual nodes to these small clusters, while introducing the CluCut measure to guide cluster merging. This measure is modified in various ways to account for cluster properties, thus provides guidance on when to terminate the merging process. Various experiments have validated the effectiveness of DOSM and the accuracy of these conclusions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01780v1-abstract-full').style.display = 'none'; document.getElementById('2411.01780v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01564">arXiv:2411.01564</a> <span> [<a href="https://arxiv.org/pdf/2411.01564">pdf</a>, <a href="https://arxiv.org/format/2411.01564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ParseCaps: An Interpretable Parsing Capsule Network for Medical Image Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xinyu Geng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaming Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01564v1-abstract-short" style="display: inline;"> Deep learning has excelled in medical image classification, but its clinical application is limited by poor interpretability. Capsule networks, known for encoding hierarchical relationships and spatial features, show potential in addressing this issue. Nevertheless, traditional capsule networks often underperform due to their shallow structures, and deeper variants lack hierarchical architectures,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01564v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01564v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01564v1-abstract-full" style="display: none;"> Deep learning has excelled in medical image classification, but its clinical application is limited by poor interpretability. Capsule networks, known for encoding hierarchical relationships and spatial features, show potential in addressing this issue. Nevertheless, traditional capsule networks often underperform due to their shallow structures, and deeper variants lack hierarchical architectures, thereby compromising interpretability. This paper introduces a novel capsule network, ParseCaps, which utilizes the sparse axial attention routing and parse convolutional capsule layer to form a parse-tree-like structure, enhancing both depth and interpretability. Firstly, sparse axial attention routing optimizes connections between child and parent capsules, as well as emphasizes the weight distribution across instantiation parameters of parent capsules. Secondly, the parse convolutional capsule layer generates capsule predictions aligning with the parse tree. Finally, based on the loss design that is effective whether concept ground truth exists or not, ParseCaps advances interpretability by associating each dimension of the global capsule with a comprehensible concept, thereby facilitating clinician trust and understanding of the model's classification results. Experimental results on CE-MRI, PH$^2$, and Derm7pt datasets show that ParseCaps not only outperforms other capsule network variants in classification accuracy, redundancy reduction and robustness, but also provides interpretable explanations, regardless of the availability of concept labels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01564v1-abstract-full').style.display = 'none'; document.getElementById('2411.01564v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01419">arXiv:2411.01419</a> <span> [<a href="https://arxiv.org/pdf/2411.01419">pdf</a>, <a href="https://arxiv.org/format/2411.01419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PSformer: Parameter-efficient Transformer with Segment Attention for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanlong Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shao-Lun Huang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D+D">Danny Dongning Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao-Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01419v1-abstract-short" style="display: inline;"> Time series forecasting remains a critical challenge across various domains, often complicated by high-dimensional data and long-term dependencies. This paper presents a novel transformer architecture for time series forecasting, incorporating two key innovations: parameter sharing (PS) and Spatial-Temporal Segment Attention (SegAtt). We also define the time series segment as the concatenation of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01419v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01419v1-abstract-full" style="display: none;"> Time series forecasting remains a critical challenge across various domains, often complicated by high-dimensional data and long-term dependencies. This paper presents a novel transformer architecture for time series forecasting, incorporating two key innovations: parameter sharing (PS) and Spatial-Temporal Segment Attention (SegAtt). We also define the time series segment as the concatenation of sequence patches from the same positions across different variables. The proposed model, PSformer, reduces the number of training parameters through the parameter sharing mechanism, thereby improving model efficiency and scalability. The introduction of SegAtt could enhance the capability of capturing local spatio-temporal dependencies by computing attention over the segments, and improve global representation by integrating information across segments. The combination of parameter sharing and SegAtt significantly improves the forecasting performance. Extensive experiments on benchmark datasets demonstrate that PSformer outperforms popular baselines and other transformer-based approaches in terms of accuracy and scalability, establishing itself as an accurate and scalable tool for time series forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01419v1-abstract-full').style.display = 'none'; document.getElementById('2411.01419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01408">arXiv:2411.01408</a> <span> [<a href="https://arxiv.org/pdf/2411.01408">pdf</a>, <a href="https://arxiv.org/format/2411.01408">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HeightMapNet: Explicit Height Modeling for End-to-End HD Map Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+W">Wenzhao Qiu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+S">Shanmin Pang</a>, <a href="/search/cs?searchtype=author&query=zhang%2C+H">Hao zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jianwu Fang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jianru Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01408v1-abstract-short" style="display: inline;"> Recent advances in high-definition (HD) map construction from surround-view images have highlighted their cost-effectiveness in deployment. However, prevailing techniques often fall short in accurately extracting and utilizing road features, as well as in the implementation of view transformation. In response, we introduce HeightMapNet, a novel framework that establishes a dynamic relationship bet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01408v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01408v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01408v1-abstract-full" style="display: none;"> Recent advances in high-definition (HD) map construction from surround-view images have highlighted their cost-effectiveness in deployment. However, prevailing techniques often fall short in accurately extracting and utilizing road features, as well as in the implementation of view transformation. In response, we introduce HeightMapNet, a novel framework that establishes a dynamic relationship between image features and road surface height distributions. By integrating height priors, our approach refines the accuracy of Bird's-Eye-View (BEV) features beyond conventional methods. HeightMapNet also introduces a foreground-background separation network that sharply distinguishes between critical road elements and extraneous background components, enabling precise focus on detailed road micro-features. Additionally, our method leverages multi-scale features within the BEV space, optimally utilizing spatial geometric information to boost model performance. HeightMapNet has shown exceptional results on the challenging nuScenes and Argoverse 2 datasets, outperforming several widely recognized approaches. The code will be available at \url{https://github.com/adasfag/HeightMapNet/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01408v1-abstract-full').style.display = 'none'; document.getElementById('2411.01408v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted to WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01136">arXiv:2411.01136</a> <span> [<a href="https://arxiv.org/pdf/2411.01136">pdf</a>, <a href="https://arxiv.org/format/2411.01136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Do LLMs Know to Respect Copyright Notice? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jialiang Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shenglan Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhaozhuo Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Denghui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01136v1-abstract-short" style="display: inline;"> Prior study shows that LLMs sometimes generate content that violates copyright. In this paper, we study another important yet underexplored problem, i.e., will LLMs respect copyright information in user input, and behave accordingly? The research problem is critical, as a negative answer would imply that LLMs will become the primary facilitator and accelerator of copyright infringement behavior. W… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01136v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01136v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01136v1-abstract-full" style="display: none;"> Prior study shows that LLMs sometimes generate content that violates copyright. In this paper, we study another important yet underexplored problem, i.e., will LLMs respect copyright information in user input, and behave accordingly? The research problem is critical, as a negative answer would imply that LLMs will become the primary facilitator and accelerator of copyright infringement behavior. We conducted a series of experiments using a diverse set of language models, user prompts, and copyrighted materials, including books, news articles, API documentation, and movie scripts. Our study offers a conservative evaluation of the extent to which language models may infringe upon copyrights when processing user input containing protected material. This research emphasizes the need for further investigation and the importance of ensuring LLMs respect copyright regulations when handling user input to prevent unauthorized use or reproduction of protected content. We also release a benchmark dataset serving as a test bed for evaluating infringement behaviors by LLMs and stress the need for future alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01136v1-abstract-full').style.display = 'none'; document.getElementById('2411.01136v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01040">arXiv:2411.01040</a> <span> [<a href="https://arxiv.org/pdf/2411.01040">pdf</a>, <a href="https://arxiv.org/format/2411.01040">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Identify Backdoored Model in Federated Learning via Individual Unlearning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiahao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zikai Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01040v1-abstract-short" style="display: inline;"> Backdoor attacks present a significant threat to the robustness of Federated Learning (FL) due to their stealth and effectiveness. They maintain both the main task of the FL system and the backdoor task simultaneously, causing malicious models to appear statistically similar to benign ones, which enables them to evade detection by existing defense methods. We find that malicious parameters in back… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01040v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01040v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01040v1-abstract-full" style="display: none;"> Backdoor attacks present a significant threat to the robustness of Federated Learning (FL) due to their stealth and effectiveness. They maintain both the main task of the FL system and the backdoor task simultaneously, causing malicious models to appear statistically similar to benign ones, which enables them to evade detection by existing defense methods. We find that malicious parameters in backdoored models are inactive on the main task, resulting in a significantly large empirical loss during the machine unlearning process on clean inputs. Inspired by this, we propose MASA, a method that utilizes individual unlearning on local models to identify malicious models in FL. To improve the performance of MASA in challenging non-independent and identically distributed (non-IID) settings, we design pre-unlearning model fusion that integrates local models with knowledge learned from other datasets to mitigate the divergence in their unlearning behaviors caused by the non-IID data distributions of clients. Additionally, we propose a new anomaly detection metric with minimal hyperparameters to filter out malicious models efficiently. Extensive experiments on IID and non-IID datasets across six different attacks validate the effectiveness of MASA. To the best of our knowledge, this is the first work to leverage machine unlearning to identify malicious models in FL. Code is available at \url{https://github.com/JiiahaoXU/MASA}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01040v1-abstract-full').style.display = 'none'; document.getElementById('2411.01040v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00965">arXiv:2411.00965</a> <span> [<a href="https://arxiv.org/pdf/2411.00965">pdf</a>, <a href="https://arxiv.org/format/2411.00965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SPOT: SE(3) Pose Trajectory Diffusion for Object-Centric Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hsu%2C+C">Cheng-Chun Hsu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bowen Wen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a>, <a href="/search/cs?searchtype=author&query=Narang%2C+Y">Yashraj Narang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Biswas%2C+J">Joydeep Biswas</a>, <a href="/search/cs?searchtype=author&query=Birchfield%2C+S">Stan Birchfield</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00965v1-abstract-short" style="display: inline;"> We introduce SPOT, an object-centric imitation learning framework. The key idea is to capture each task by an object-centric representation, specifically the SE(3) object pose trajectory relative to the target. This approach decouples embodiment actions from sensory inputs, facilitating learning from various demonstration types, including both action-based and action-less human hand demonstrations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00965v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00965v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00965v1-abstract-full" style="display: none;"> We introduce SPOT, an object-centric imitation learning framework. The key idea is to capture each task by an object-centric representation, specifically the SE(3) object pose trajectory relative to the target. This approach decouples embodiment actions from sensory inputs, facilitating learning from various demonstration types, including both action-based and action-less human hand demonstrations, as well as cross-embodiment generalization. Additionally, object pose trajectories inherently capture planning constraints from demonstrations without the need for manually crafted rules. To guide the robot in executing the task, the object trajectory is used to condition a diffusion policy. We show improvement compared to prior work on RLBench simulated tasks. In real-world evaluation, using only eight demonstrations shot on an iPhone, our approach completed all tasks while fully complying with task constraints. Project page: https://nvlabs.github.io/object_centric_diffusion <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00965v1-abstract-full').style.display = 'none'; document.getElementById('2411.00965v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00848">arXiv:2411.00848</a> <span> [<a href="https://arxiv.org/pdf/2411.00848">pdf</a>, <a href="https://arxiv.org/format/2411.00848">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Evidential Reliability In Pattern Recognition Based On Intuitionistic Fuzzy Sets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Juntao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+T">Tianxiang Zhan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yong Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00848v1-abstract-short" style="display: inline;"> Determining the reliability of evidence sources is a crucial topic in Dempster-Shafer theory (DST). Previous approaches have addressed high conflicts between evidence sources using discounting methods, but these methods may not ensure the high efficiency of classification models. In this paper, we consider the combination of DS theory and Intuitionistic Fuzzy Sets (IFS) and propose an algorithm fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00848v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00848v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00848v1-abstract-full" style="display: none;"> Determining the reliability of evidence sources is a crucial topic in Dempster-Shafer theory (DST). Previous approaches have addressed high conflicts between evidence sources using discounting methods, but these methods may not ensure the high efficiency of classification models. In this paper, we consider the combination of DS theory and Intuitionistic Fuzzy Sets (IFS) and propose an algorithm for quantifying the reliability of evidence sources, called Fuzzy Reliability Index (FRI). The FRI algorithm is based on decision quantification rules derived from IFS, defining the contribution of different BPAs to correct decisions and deriving the evidential reliability from these contributions. The proposed method effectively enhances the rationality of reliability estimation for evidence sources, making it particularly suitable for classification decision problems in complex scenarios. Subsequent comparisons with DST-based algorithms and classical machine learning algorithms demonstrate the superiority and generalizability of the FRI algorithm. The FRI algorithm provides a new perspective for future decision probability conversion and reliability analysis of evidence sources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00848v1-abstract-full').style.display = 'none'; document.getElementById('2411.00848v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00442">arXiv:2411.00442</a> <span> [<a href="https://arxiv.org/pdf/2411.00442">pdf</a>, <a href="https://arxiv.org/format/2411.00442">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> FPRev: Revealing the Order of Floating-Point Summation by Numerical Testing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+P">Peichen Xie</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yanjie Gao</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jilong Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00442v1-abstract-short" style="display: inline;"> The order of floating-point summation is a key factor in numerical reproducibility. However, this critical information is generally unspecified and unknown for most summation-based functions in numerical libraries, making it challenging to migrate them to new environments reproducibly. This paper presents novel, non-intrusive, testing-based algorithms that can reveal the order of floating-point su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00442v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00442v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00442v1-abstract-full" style="display: none;"> The order of floating-point summation is a key factor in numerical reproducibility. However, this critical information is generally unspecified and unknown for most summation-based functions in numerical libraries, making it challenging to migrate them to new environments reproducibly. This paper presents novel, non-intrusive, testing-based algorithms that can reveal the order of floating-point summation by treating functions as callable black boxes. By constructing well-designed input that can cause the swamping phenomenon of floating-point addition, we can infer the order of summation from the output. We introduce FPRev, a tool that implements these algorithms, and validate its efficiency through extensive experiments with popular numerical libraries on various CPUs and GPUs (including those with Tensor Cores). FPRev reveals the varying summation orders across different libraries and devices, and outperforms other methods in terms of time complexity. The source code of FPRev is at \url{https://github.com/microsoft/RepDL/tree/main/tools/FPRev}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00442v1-abstract-full').style.display = 'none'; document.getElementById('2411.00442v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00316">arXiv:2411.00316</a> <span> [<a href="https://arxiv.org/pdf/2411.00316">pdf</a>, <a href="https://arxiv.org/format/2411.00316">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Quantum Entanglement Path Selection and Qubit Allocation via Adversarial Group Neural Bandits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yin Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00316v1-abstract-short" style="display: inline;"> Quantum Data Networks (QDNs) have emerged as a promising framework in the field of information processing and transmission, harnessing the principles of quantum mechanics. QDNs utilize a quantum teleportation technique through long-distance entanglement connections, encoding data information in quantum bits (qubits). Despite being a cornerstone in various quantum applications, quantum entanglement… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00316v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00316v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00316v1-abstract-full" style="display: none;"> Quantum Data Networks (QDNs) have emerged as a promising framework in the field of information processing and transmission, harnessing the principles of quantum mechanics. QDNs utilize a quantum teleportation technique through long-distance entanglement connections, encoding data information in quantum bits (qubits). Despite being a cornerstone in various quantum applications, quantum entanglement encounters challenges in establishing connections over extended distances due to probabilistic processes influenced by factors like optical fiber losses. The creation of long-distance entanglement connections between quantum computers involves multiple entanglement links and entanglement swapping techniques through successive quantum nodes, including quantum computers and quantum repeaters, necessitating optimal path selection and qubit allocation. Current research predominantly assumes known success rates of entanglement links between neighboring quantum nodes and overlooks potential network attackers. This paper addresses the online challenge of optimal path selection and qubit allocation, aiming to learn the best strategy for achieving the highest success rate of entanglement connections between two chosen quantum computers without prior knowledge of the success rate and in the presence of a QDN attacker. The proposed approach is based on multi-armed bandits, specifically adversarial group neural bandits, which treat each path as a group and view qubit allocation as arm selection. Our contributions encompass formulating an online adversarial optimization problem, introducing the EXPNeuralUCB bandits algorithm with theoretical performance guarantees, and conducting comprehensive simulations to showcase its superiority over established advanced algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00316v1-abstract-full').style.display = 'none'; document.getElementById('2411.00316v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE/ACM Transactions on Networking</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00075">arXiv:2411.00075</a> <span> [<a href="https://arxiv.org/pdf/2411.00075">pdf</a>, <a href="https://arxiv.org/format/2411.00075">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> $\boldsymbol渭\mathbf{P^2}$: Effective Sharpness Aware Minimization Requires Layerwise Perturbation Scaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haas%2C+M">Moritz Haas</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jin Xu</a>, <a href="/search/cs?searchtype=author&query=Cevher%2C+V">Volkan Cevher</a>, <a href="/search/cs?searchtype=author&query=Vankadara%2C+L+C">Leena Chennuru Vankadara</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00075v1-abstract-short" style="display: inline;"> Sharpness Aware Minimization (SAM) enhances performance across various neural architectures and datasets. As models are continually scaled up to improve performance, a rigorous understanding of SAM's scaling behaviour is paramount. To this end, we study the infinite-width limit of neural networks trained with SAM, using the Tensor Programs framework. Our findings reveal that the dynamics of standa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00075v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00075v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00075v1-abstract-full" style="display: none;"> Sharpness Aware Minimization (SAM) enhances performance across various neural architectures and datasets. As models are continually scaled up to improve performance, a rigorous understanding of SAM's scaling behaviour is paramount. To this end, we study the infinite-width limit of neural networks trained with SAM, using the Tensor Programs framework. Our findings reveal that the dynamics of standard SAM effectively reduce to applying SAM solely in the last layer in wide neural networks, even with optimal hyperparameters. In contrast, we identify a stable parameterization with layerwise perturbation scaling, which we call $\textit{Maximal Update and Perturbation Parameterization}$ ($渭$P$^2$), that ensures all layers are both feature learning and effectively perturbed in the limit. Through experiments with MLPs, ResNets and Vision Transformers, we empirically demonstrate that $渭$P$^2$ is the first parameterization to achieve hyperparameter transfer of the joint optimum of learning rate and perturbation radius across model scales. Moreover, we provide an intuitive condition to derive $渭$P$^2$ for other perturbation rules like Adaptive SAM and SAM-ON, also ensuring balanced perturbation effects across all layers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00075v1-abstract-full').style.display = 'none'; document.getElementById('2411.00075v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24200">arXiv:2410.24200</a> <span> [<a href="https://arxiv.org/pdf/2410.24200">pdf</a>, <a href="https://arxiv.org/format/2410.24200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Length-Induced Embedding Collapse in Transformer-based Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqi Zhou</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+S">Sunhao Dai</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhanshuo Cao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24200v1-abstract-short" style="display: inline;"> Text embeddings enable various applications, but their performance deteriorates on longer texts. In this paper, we find that the performance degradation is due to a phenomenon called Length Collapse, where longer text embeddings collapse into a narrow space. This collapse results in a distributional inconsistency between embeddings of different text lengths, ultimately hurting the performance of d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24200v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24200v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24200v1-abstract-full" style="display: none;"> Text embeddings enable various applications, but their performance deteriorates on longer texts. In this paper, we find that the performance degradation is due to a phenomenon called Length Collapse, where longer text embeddings collapse into a narrow space. This collapse results in a distributional inconsistency between embeddings of different text lengths, ultimately hurting the performance of downstream tasks. Theoretically, by considering the self-attention mechanism inherently functions as a low-pass filter, we prove that long sequences increase the attenuation rate of the low-pass filter effect of the self-attention mechanism. With layers going deeper, excessive low-pass filtering causes the token signals to retain only their Direct-Current (DC) component, which means the input token feature maps will collapse into a narrow space, especially in long texts. Based on the above analysis, we propose to mitigate the undesirable length collapse limitation by introducing a temperature in softmax(), which achieves a higher low-filter attenuation rate. The tuning-free method, called TempScale, can be plugged into multiple transformer-based embedding models. Empirically, we demonstrate that TempScale can improve existing embedding models, especially on long text inputs, bringing up to 0.53% performance gains on 40 datasets from Massive Text Embedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from LongEmbed, which specifically focuses on long context retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24200v1-abstract-full').style.display = 'none'; document.getElementById('2410.24200v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22772">arXiv:2410.22772</a> <span> [<a href="https://arxiv.org/pdf/2410.22772">pdf</a>, <a href="https://arxiv.org/format/2410.22772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reliability Assessment of Information Sources Based on Random Permutation Set </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Juntao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+T">Tianxiang Zhan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yong Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22772v1-abstract-short" style="display: inline;"> In pattern recognition, handling uncertainty is a critical challenge that significantly affects decision-making and classification accuracy. Dempster-Shafer Theory (DST) is an effective reasoning framework for addressing uncertainty, and the Random Permutation Set (RPS) extends DST by additionally considering the internal order of elements, forming a more ordered extension of DST. However, there i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22772v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22772v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22772v1-abstract-full" style="display: none;"> In pattern recognition, handling uncertainty is a critical challenge that significantly affects decision-making and classification accuracy. Dempster-Shafer Theory (DST) is an effective reasoning framework for addressing uncertainty, and the Random Permutation Set (RPS) extends DST by additionally considering the internal order of elements, forming a more ordered extension of DST. However, there is a lack of a transformation method based on permutation order between RPS and DST, as well as a sequence-based probability transformation method for RPS. Moreover, the reliability of RPS sources remains an issue that requires attention. To address these challenges, this paper proposes an RPS transformation approach and a probability transformation method tailored for RPS. On this basis, a reliability computation method for RPS sources, based on the RPS probability transformation, is introduced and applied to pattern recognition. Experimental results demonstrate that the proposed approach effectively bridges the gap between DST and RPS and achieves superior recognition accuracy in classification problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22772v1-abstract-full').style.display = 'none'; document.getElementById('2410.22772v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21017">arXiv:2410.21017</a> <span> [<a href="https://arxiv.org/pdf/2410.21017">pdf</a>, <a href="https://arxiv.org/format/2410.21017">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Edge Perception: Intelligent Wireless Sensing at Network Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuanhao Cui</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaowen Cao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiali Nie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21017v1-abstract-short" style="display: inline;"> Future sixth-generation (6G) networks are envisioned to support intelligent applications across various vertical scenarios, which have stringent requirements on high-precision sensing as well as ultra-low-latency data processing and decision making. Towards this end, a new paradigm of edge perception networks emerges, which integrates wireless sensing, communication, computation, and artificial in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21017v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21017v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21017v1-abstract-full" style="display: none;"> Future sixth-generation (6G) networks are envisioned to support intelligent applications across various vertical scenarios, which have stringent requirements on high-precision sensing as well as ultra-low-latency data processing and decision making. Towards this end, a new paradigm of edge perception networks emerges, which integrates wireless sensing, communication, computation, and artificial intelligence (AI) capabilities at network edge for intelligent sensing and data processing. This article provides a timely overview on this emerging topic. We commence by discussing wireless edge perception, including physical layer transceiver design, network-wise cooperation, and application-specific data analytics, for which the prospects and challenges are emphasized. Next, we discuss the interplay between edge AI and wireless sensing in edge perception, and present various key techniques for two paradigms, namely edge AI empowered sensing and task-oriented sensing for edge AI, respectively. Finally, we emphasize interesting research directions on edge perception to motivate future works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21017v1-abstract-full').style.display = 'none'; document.getElementById('2410.21017v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20957">arXiv:2410.20957</a> <span> [<a href="https://arxiv.org/pdf/2410.20957">pdf</a>, <a href="https://arxiv.org/format/2410.20957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Neuro-symbolic Learning Yielding Logical Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yunpeng Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Taolue Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaoxing Ma</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20957v1-abstract-short" style="display: inline;"> Neuro-symbolic systems combine the abilities of neural perception and logical reasoning. However, end-to-end learning of neuro-symbolic systems is still an unsolved challenge. This paper proposes a natural framework that fuses neural network training, symbol grounding, and logical constraint synthesis into a coherent and efficient end-to-end learning process. The capability of this framework comes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20957v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20957v1-abstract-full" style="display: none;"> Neuro-symbolic systems combine the abilities of neural perception and logical reasoning. However, end-to-end learning of neuro-symbolic systems is still an unsolved challenge. This paper proposes a natural framework that fuses neural network training, symbol grounding, and logical constraint synthesis into a coherent and efficient end-to-end learning process. The capability of this framework comes from the improved interactions between the neural and the symbolic parts of the system in both the training and inference stages. Technically, to bridge the gap between the continuous neural network and the discrete logical constraint, we introduce a difference-of-convex programming technique to relax the logical constraints while maintaining their precision. We also employ cardinality constraints as the language for logical constraint learning and incorporate a trust region method to avoid the degeneracy of logical constraint in learning. Both theoretical analyses and empirical evaluations substantiate the effectiveness of the proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20957v1-abstract-full').style.display = 'none'; document.getElementById('2410.20957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at NeurIPS 2023, and code is available at [this url](https://github.com/Lizn-zn/Nesy-Programming)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20900">arXiv:2410.20900</a> <span> [<a href="https://arxiv.org/pdf/2410.20900">pdf</a>, <a href="https://arxiv.org/format/2410.20900">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Parameterized Approximation for Capacitated $d$-Hitting Set with Hard Capacities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lokshtanov%2C+D">Daniel Lokshtanov</a>, <a href="/search/cs?searchtype=author&query=Sahu%2C+A">Abhishek Sahu</a>, <a href="/search/cs?searchtype=author&query=Saurabh%2C+S">Saket Saurabh</a>, <a href="/search/cs?searchtype=author&query=Surianarayanan%2C+V">Vaishali Surianarayanan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jie Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20900v1-abstract-short" style="display: inline;"> The \textsc{Capacitated $d$-Hitting Set} problem involves a universe $U$ with a capacity function $\mathsf{cap}: U \rightarrow \mathbb{N}$ and a collection $\mathcal{A}$ of subsets of $U$, each of size at most $d$. The goal is to find a minimum subset $S \subseteq U$ and an assignment $蠁: \mathcal{A} \rightarrow S$ such that for every $A \in \mathcal{A}$, $蠁(A) \in A$, and for each $x \in U$,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20900v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20900v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20900v1-abstract-full" style="display: none;"> The \textsc{Capacitated $d$-Hitting Set} problem involves a universe $U$ with a capacity function $\mathsf{cap}: U \rightarrow \mathbb{N}$ and a collection $\mathcal{A}$ of subsets of $U$, each of size at most $d$. The goal is to find a minimum subset $S \subseteq U$ and an assignment $蠁: \mathcal{A} \rightarrow S$ such that for every $A \in \mathcal{A}$, $蠁(A) \in A$, and for each $x \in U$, $|蠁^{-1}(x)| \leq \mathsf{cap}(x)$. For $d=2$, this is known as \textsc{Capacitated Vertex Cover}. In the weighted variant, each element of $U$ has a positive integer weight, with the objective of finding a minimum-weight capacitated hitting set. Chuzhoy and Naor [SICOMP 2006] provided a factor-3 approximation for \textsc{Capacitated Vertex Cover} and showed that the weighted case lacks an $o(\log n)$-approximation unless $P=NP$. Kao and Wong [SODA 2017] later independently achieved a $d$-approximation for \textsc{Capacitated $d$-Hitting Set}, with no $d - 蔚$ improvements possible under the Unique Games Conjecture. Our main result is a parameterized approximation algorithm with runtime $\left(\frac{k}蔚\right)^k 2^{k^{O(kd)}}(|U|+|\mathcal{A}|)^{O(1)}$ that either concludes no solution of size $\leq k$ exists or finds $S$ of size $\leq 4/3 \cdot k$ and weight at most $2+蔚$ times the minimum weight for solutions of size $\leq k$. We further show that no FPT-approximation with factor $c > 1$ exists for unweighted \textsc{Capacitated $d$-Hitting Set} with $d \geq 3$, nor with factor $2 - 蔚$ for the weighted version, assuming the Exponential Time Hypothesis. These results extend to \textsc{Capacitated Vertex Cover} in multigraphs. Additionally, a variant of multi-dimensional \textsc{Knapsack} is shown hard to FPT-approximate within $2 - 蔚$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20900v1-abstract-full').style.display = 'none'; document.getElementById('2410.20900v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SODA 2025, Abstract is shortened due to space requirement</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20815">arXiv:2410.20815</a> <span> [<a href="https://arxiv.org/pdf/2410.20815">pdf</a>, <a href="https://arxiv.org/format/2410.20815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Grid4D: 4D Decomposed Hash Encoding for High-fidelity Dynamic Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiawei Xu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zexin Fan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jin Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20815v1-abstract-short" style="display: inline;"> Recently, Gaussian splatting has received more and more attention in the field of static scene rendering. Due to the low computational overhead and inherent flexibility of explicit representations, plane-based explicit methods are popular ways to predict deformations for Gaussian-based dynamic scene rendering models. However, plane-based methods rely on the inappropriate low-rank assumption and ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20815v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20815v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20815v1-abstract-full" style="display: none;"> Recently, Gaussian splatting has received more and more attention in the field of static scene rendering. Due to the low computational overhead and inherent flexibility of explicit representations, plane-based explicit methods are popular ways to predict deformations for Gaussian-based dynamic scene rendering models. However, plane-based methods rely on the inappropriate low-rank assumption and excessively decompose the space-time 4D encoding, resulting in overmuch feature overlap and unsatisfactory rendering quality. To tackle these problems, we propose Grid4D, a dynamic scene rendering model based on Gaussian splatting and employing a novel explicit encoding method for the 4D input through the hash encoding. Different from plane-based explicit representations, we decompose the 4D encoding into one spatial and three temporal 3D hash encodings without the low-rank assumption. Additionally, we design a novel attention module that generates the attention scores in a directional range to aggregate the spatial and temporal features. The directional attention enables Grid4D to more accurately fit the diverse deformations across distinct scene components based on the spatial encoded features. Moreover, to mitigate the inherent lack of smoothness in explicit representation methods, we introduce a smooth regularization term that keeps our model from the chaos of deformation prediction. Our experiments demonstrate that Grid4D significantly outperforms the state-of-the-art models in visual quality and rendering speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20815v1-abstract-full').style.display = 'none'; document.getElementById('2410.20815v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20750">arXiv:2410.20750</a> <span> [<a href="https://arxiv.org/pdf/2410.20750">pdf</a>, <a href="https://arxiv.org/format/2410.20750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ODRL: A Benchmark for Off-Dynamics Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+J">Jiafei Lyu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kang Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiacheng Xu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Mengbei Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingwen Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zongzhang Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+C">Chenjia Bai</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zongqing Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20750v1-abstract-short" style="display: inline;"> We consider off-dynamics reinforcement learning (RL) where one needs to transfer policies across different domains with dynamics mismatch. Despite the focus on developing dynamics-aware algorithms, this field is hindered due to the lack of a standard benchmark. To bridge this gap, we introduce ODRL, the first benchmark tailored for evaluating off-dynamics RL methods. ODRL contains four experimenta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20750v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20750v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20750v1-abstract-full" style="display: none;"> We consider off-dynamics reinforcement learning (RL) where one needs to transfer policies across different domains with dynamics mismatch. Despite the focus on developing dynamics-aware algorithms, this field is hindered due to the lack of a standard benchmark. To bridge this gap, we introduce ODRL, the first benchmark tailored for evaluating off-dynamics RL methods. ODRL contains four experimental settings where the source and target domains can be either online or offline, and provides diverse tasks and a broad spectrum of dynamics shifts, making it a reliable platform to comprehensively evaluate the agent's adaptation ability to the target domain. Furthermore, ODRL includes recent off-dynamics RL algorithms in a unified framework and introduces some extra baselines for different settings, all implemented in a single-file manner. To unpack the true adaptation capability of existing methods, we conduct extensive benchmarking experiments, which show that no method has universal advantages across varied dynamics shifts. We hope this benchmark can serve as a cornerstone for future research endeavors. Our code is publicly available at https://github.com/OffDynamicsRL/off-dynamics-rl. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20750v1-abstract-full').style.display = 'none'; document.getElementById('2410.20750v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 D&B Track</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xue%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository