
Search | arXiv e-print repository

Showing 1–50 of 64 results for author: Qiao, X
Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page (page 1 of 2).
(Search v0.5.6, released 2020-02-24)

</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Qiao%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Qiao%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qiao%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14933">arXiv:2501.14933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.14933">pdf</a>, <a href="https://arxiv.org/format/2501.14933">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Conformal Inference of Individual Treatment Effects Using Conditional Density Estimates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Baozhen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xingye Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14933v1-abstract-short" style="display: inline;"> In an era where diverse and complex data are increasingly accessible, the precise prediction of individual treatment effects (ITE) becomes crucial across fields such as healthcare, economics, and public policy. Current state-of-the-art approaches, while providing valid prediction intervals through Conformal Quantile Regression (CQR) and related techniques, often yield overly conservative predictio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14933v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14933v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14933v1-abstract-full" style="display: none;"> In an era where diverse and complex data are increasingly accessible, the precise prediction of individual treatment effects (ITE) becomes crucial across fields such as healthcare, economics, and public policy. 
Current state-of-the-art approaches, while providing valid prediction intervals through Conformal Quantile Regression (CQR) and related techniques, often yield overly conservative prediction intervals. In this work, we introduce a conformal inference approach to ITE using the conditional density of the outcome given the covariates. We leverage the reference distribution technique to efficiently estimate the conditional densities as the score functions under a two-stage conformal ITE framework. We show that our prediction intervals are not only marginally valid but are narrower than existing methods. Experimental results further validate the usefulness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14933v1-abstract-full').style.display = 'none'; document.getElementById('2501.14933v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03072">arXiv:2412.03072</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.03072">pdf</a>, <a href="https://arxiv.org/format/2412.03072">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Preference-based opponent shaping in differentiable games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinyu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yudong Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+C">Congying Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Weiyan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+T">Tiande Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03072v1-abstract-short" style="display: inline;"> Strategy learning in game environments with multi-agent is a challenging problem. Since each agent&#39;s reward is determined by the joint strategy, a greedy learning strategy that aims to maximize its own reward may fall into a local optimum. Recent studies have proposed the opponent modeling and shaping methods for game environments. These methods enhance the efficiency of strategy learning by model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03072v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03072v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03072v1-abstract-full" style="display: none;"> Strategy learning in game environments with multi-agent is a challenging problem. Since each agent&#39;s reward is determined by the joint strategy, a greedy learning strategy that aims to maximize its own reward may fall into a local optimum. Recent studies have proposed the opponent modeling and shaping methods for game environments. 
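
As background for the entry above: the marginal coverage guarantee it refers to comes from the generic split-conformal recipe, sketched below in Python. This is only a reference point, not the paper's two-stage conditional-density method; the linear toy model, the synthetic data, and alpha = 0.1 are assumptions for illustration.

```python
import numpy as np

def split_conformal_interval(x_train, y_train, x_cal, y_cal, x_new, alpha=0.1):
    """Split conformal prediction with absolute-residual nonconformity scores.

    Fit a simple model on the training split, score |y - f(x)| on the
    calibration split, and return an interval for x_new with marginal
    coverage >= 1 - alpha under exchangeability.
    """
    coeffs = np.polyfit(x_train, y_train, deg=1)       # toy model: a straight line
    predict = lambda x: np.polyval(coeffs, x)

    scores = np.abs(y_cal - predict(x_cal))            # nonconformity scores
    n = len(scores)
    level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)   # finite-sample correction
    q = np.quantile(scores, level, method="higher")

    pred = predict(x_new)
    return pred - q, pred + q

# Usage on synthetic data: roughly 90% of fresh points should fall inside.
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 600)
y = 2.0 * x + rng.normal(0, 1, 600)
lo, hi = split_conformal_interval(x[:300], y[:300], x[300:], y[300:], x_new=5.0)
print(f"90% interval at x=5: [{lo:.2f}, {hi:.2f}]")
```

The paper's contribution replaces the absolute-residual score with conditional density estimates inside a two-stage ITE framework; the sketch only shows the shared split-conformal skeleton those methods build on.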

2. arXiv:2412.03072 [pdf, other]  (cs.AI)
   Preference-based opponent shaping in differentiable games
   Authors: Xinyu Qiao, Yudong Hu, Congying Han, Weiyan Wu, Tiande Guo
   Abstract: Strategy learning in multi-agent game environments is a challenging problem. Since each agent's reward is determined by the joint strategy, a greedy learning strategy that aims to maximize its own reward may fall into a local optimum. Recent studies have proposed opponent modeling and shaping methods for game environments. These methods enhance the efficiency of strategy learning by modeling the strategies and update processes of other agents. However, they often rely on simple predictions of opponent strategy changes. Because behavioral preferences such as cooperation and competition are not modeled, these methods are usually applicable only to predefined scenarios and lack generalization capabilities. In this paper, we propose a novel Preference-based Opponent Shaping (PBOS) method to enhance the strategy learning process by shaping agents' preferences towards cooperation. We introduce a preference parameter that is incorporated into the agent's loss function, allowing the agent to directly consider the opponent's loss function when updating its strategy. We update the preference parameters concurrently with strategy learning to ensure that agents can adapt to any cooperative or competitive game environment. Through a series of experiments, we verify the performance of the PBOS algorithm in a variety of differentiable games. The experimental results show that PBOS can guide the agent to learn appropriate preference parameters, so as to achieve better reward distribution in multiple game environments.
   Submitted 4 December, 2024; originally announced December 2024.
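
The abstract above describes PBOS only at a high level, so the toy sketch below merely illustrates the stated idea of adding a preference-weighted opponent loss to each agent's own loss in a differentiable game. The quadratic game, the fixed preference weights, and the plain simultaneous gradient steps are assumptions for illustration, not the paper's algorithm (which also learns the preference parameters during training).

```python
import numpy as np

# Assumed toy differentiable game: each agent controls one scalar.
# loss_1(x, y) = (x - 1)^2 + x*y     loss_2(x, y) = (y + 1)^2 - x*y
def loss_1(x, y): return (x - 1.0) ** 2 + x * y
def loss_2(x, y): return (y + 1.0) ** 2 - x * y

def grad_shaped(x, y, w1, w2):
    """Gradients of the preference-shaped losses L_i + w_i * L_opponent."""
    dL1_dx, dL2_dx = 2 * (x - 1.0) + y, -y      # partials w.r.t. agent 1's strategy
    dL1_dy, dL2_dy = x, 2 * (y + 1.0) - x       # partials w.r.t. agent 2's strategy
    return dL1_dx + w1 * dL2_dx, dL2_dy + w2 * dL1_dy

x, y, lr = 0.0, 0.0, 0.05
w1, w2 = 0.3, 0.3   # preference weights, held fixed here; PBOS learns them
for _ in range(500):
    gx, gy = grad_shaped(x, y, w1, w2)
    x, y = x - lr * gx, y - lr * gy   # simultaneous gradient steps

print(f"strategies: x={x:.3f}, y={y:.3f}; losses: {loss_1(x, y):.3f}, {loss_2(x, y):.3f}")
```

With w1 = w2 = 0 this reduces to independent greedy learning; positive weights make each agent internalize part of the opponent's objective, which is the cooperation-shaping behaviour the abstract describes.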

3. arXiv:2411.13159 [pdf, other]  (cs.CL, cs.SD, eess.AS)
   Hard-Synth: Synthesizing Diverse Hard Samples for ASR using Zero-Shot TTS and LLM
   Authors: Jiawei Yu, Yuang Li, Xiaosong Qiao, Huan Zhao, Xiaofeng Zhao, Wei Tang, Min Zhang, Hao Yang, Jinsong Su
   Abstract: Text-to-speech (TTS) models have been widely adopted to enhance automatic speech recognition (ASR) systems using text-only corpora, thereby reducing the cost of labeling real speech data. Existing research primarily utilizes additional text data and predefined speech styles supported by TTS models. In this paper, we propose Hard-Synth, a novel ASR data augmentation method that leverages large language models (LLMs) and advanced zero-shot TTS. Our approach employs LLMs to generate diverse in-domain text through rewriting, without relying on additional text data. Rather than using predefined speech styles, we introduce a hard prompt selection method with zero-shot TTS to clone speech styles that the ASR model finds challenging to recognize. Experiments demonstrate that Hard-Synth significantly enhances the Conformer model, achieving relative word error rate (WER) reductions of 6.5%/4.4% on LibriSpeech dev/test-other subsets. Additionally, we show that Hard-Synth is data-efficient and capable of reducing bias in ASR.
   Submitted 20 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01588">arXiv:2410.01588</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01588">pdf</a>, <a href="https://arxiv.org/format/2410.01588">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DynFrs: An Efficient Framework for Machine Unlearning in Random Forest </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shurong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Z">Zhuoyang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinbao Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tongning Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Meng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01588v1-abstract-short" style="display: inline;"> Random Forests are widely recognized for establishing efficacy in classification and regression tasks, standing out in various domains such as medical diagnosis, finance, and personalized recommendations. These domains, however, are inherently sensitive to privacy concerns, as personal and confidential data are involved. With increasing demand for the right to be forgotten, particularly under regu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01588v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01588v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01588v1-abstract-full" style="display: none;"> Random Forests are widely recognized for establishing efficacy in classification and regression tasks, standing out in various domains such as medical diagnosis, finance, and personalized recommendations. These domains, however, are inherently sensitive to privacy concerns, as personal and confidential data are involved. With increasing demand for the right to be forgotten, particularly under regulations such as GDPR and CCPA, the ability to perform machine unlearning has become crucial for Random Forests. However, insufficient attention was paid to this topic, and existing approaches face difficulties in being applied to real-world scenarios. Addressing this gap, we propose the DynFrs framework designed to enable efficient machine unlearning in Random Forests while preserving predictive accuracy. Dynfrs leverages subsampling method Occ(q) and a lazy tag strategy Lzy, and is still adaptable to any Random Forest variant. In essence, Occ(q) ensures that each sample in the training set occurs only in a proportion of trees so that the impact of deleting samples is limited, and Lzy delays the reconstruction of a tree node until necessary, thereby avoiding unnecessary modifications on tree structures. In experiments, applying Dynfrs on Extremely Randomized Trees yields substantial improvements, achieving orders of magnitude faster unlearning performance and better predictive accuracy than existing machine unlearning methods for Random Forests. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01588v1-abstract-full').style.display = 'none'; document.getElementById('2410.01588v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13262">arXiv:2409.13262</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13262">pdf</a>, <a href="https://arxiv.org/format/2409.13262">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Should Understand Pinyin for Chinese ASR Error Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaosong Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xiaofeng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+W">Wei Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13262v1-abstract-short" style="display: inline;"> Large language models can enhance automatic speech recognition systems through generative error correction. In this paper, we propose Pinyin-enhanced GEC, which leverages Pinyi, the phonetic representation of Mandarin Chinese, as supplementary information to improve Chinese ASR error correction. Our approach only utilizes synthetic errors for training and employs the one-best hypothesis during inf&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13262v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13262v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13262v1-abstract-full" style="display: none;"> Large language models can enhance automatic speech recognition systems through generative error correction. In this paper, we propose Pinyin-enhanced GEC, which leverages Pinyi, the phonetic representation of Mandarin Chinese, as supplementary information to improve Chinese ASR error correction. Our approach only utilizes synthetic errors for training and employs the one-best hypothesis during inference. Additionally, we introduce a multitask training approach involving conversion tasks between Pinyin and text to align their feature spaces. Experiments on the Aishell-1 and the Common Voice datasets demonstrate that our approach consistently outperforms GEC with text-only input. 

5. arXiv:2409.13262 [pdf, other]  (cs.CL, cs.SD, eess.AS)
   Large Language Model Should Understand Pinyin for Chinese ASR Error Correction
   Authors: Yuang Li, Xiaosong Qiao, Xiaofeng Zhao, Huan Zhao, Wei Tang, Min Zhang, Hao Yang
   Abstract: Large language models can enhance automatic speech recognition systems through generative error correction (GEC). In this paper, we propose Pinyin-enhanced GEC (PY-GEC), which leverages Pinyin, the phonetic representation of Mandarin Chinese, as supplementary information to improve Chinese ASR error correction. Our approach uses only synthetic errors for training and employs the one-best hypothesis during inference. Additionally, we introduce a multitask training approach involving conversion tasks between Pinyin and text to align their feature spaces. Experiments on the Aishell-1 and Common Voice datasets demonstrate that our approach consistently outperforms GEC with text-only input. More importantly, we provide intuitive explanations for the effectiveness of PY-GEC and multitask training from two aspects: 1) increased attention weight on Pinyin features; and 2) aligned feature space between Pinyin and text hidden states.
   Submitted 20 September, 2024; originally announced September 2024.

6. arXiv:2409.11693 [pdf, ps, other]  (cs.CR, cs.IT)
   On the second-order zero differential properties of several classes of power functions over finite fields
   Authors: Huan Zhou, Xiaoni Du, Xingbin Qiao, Wenping Yuan
   Abstract: The Feistel Boomerang Connectivity Table (FBCT) is an important cryptanalytic tool for analysing the resistance of Feistel network-based ciphers to attacks such as differential and boomerang attacks. Moreover, the coefficients of the FBCT are closely related to the second-order zero differential spectrum of a function $F(x)$ over finite fields of even characteristic, and the Feistel boomerang uniformity is the second-order zero differential uniformity of $F(x)$. In this paper, by computing the number of solutions of specific equations over finite fields, we determine explicitly the second-order zero differential spectra of the power functions $x^{2^m+3}$ and $x^{2^m+5}$ with $m>2$ a positive integer over finite fields of even characteristic, and of $x^{p^k+1}$ with integer $k\geq1$ over finite fields of odd characteristic $p$. It is worth noting that $x^{2^m+3}$ is a permutation over $\mathbb{F}_{2^n}$, and $x^{2^m+5}$ is a permutation over $\mathbb{F}_{2^n}$ only when $m$ is odd, where $n=2m$. As a byproduct, we find that $F(x)=x^4$ is a PN and second-order zero differentially $0$-uniform function over $\mathbb{F}_{3^n}$ with $n$ odd. The computation of these entries and of the cardinalities in each table is intended to facilitate the analysis of differential and boomerang cryptanalysis of S-boxes when studying distinguishers and trails.
   Submitted 18 September, 2024; v1 submitted 18 September, 2024; originally announced September 2024.
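
For readers unfamiliar with the object computed in the entry above, the brute-force check below uses what appears to be the standard definition of the second-order zero differential spectrum in the literature (this should be verified against the paper): for $F$ over a finite field, the entry at $(a,b)$ counts the $x$ with $F(x+a+b)-F(x+a)-F(x+b)+F(x)=0$, and in even characteristic these counts form the FBCT. The sketch works over a prime field $\mathbb{F}_p$ only, so plain modular arithmetic suffices; it checks the abstract's byproduct claim for $x^4$ in characteristic 3 only in the simplest case $n=1$.

```python
def second_order_zero_spectrum(p, exponent):
    """Second-order zero differential spectrum of F(x) = x^exponent over F_p.

    For every (a, b) with a, b != 0, count the x in F_p satisfying
        F(x+a+b) - F(x+a) - F(x+b) + F(x) == 0  (mod p).
    Returns a dict mapping each solution count to how often it occurs.
    """
    F = lambda x: pow(x, exponent, p)
    spectrum = {}
    for a in range(1, p):
        for b in range(1, p):
            count = sum(
                (F((x + a + b) % p) - F((x + a) % p) - F((x + b) % p) + F(x)) % p == 0
                for x in range(p)
            )
            spectrum[count] = spectrum.get(count, 0) + 1
    return spectrum

# x^4 over F_3 (the n = 1 case of the abstract's byproduct claim):
# every nontrivial (a, b) admits zero solutions, i.e. the map is 0-uniform.
print(second_order_zero_spectrum(3, 4))   # expected: {0: 4}
```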

7. arXiv:2409.10491 [pdf, other]  (cs.RO)
   Radar Teach and Repeat: Architecture and Initial Field Testing
   Authors: Xinyuan Qiao, Alexander Krawciw, Sven Lilge, Timothy D. Barfoot
   Abstract: Frequency-modulated continuous-wave (FMCW) scanning radar has emerged as an alternative to spinning LiDAR for state estimation on mobile robots. Radar's longer wavelength is less affected by small particulates, providing operational advantages in challenging environments such as dust, smoke, and fog. This paper presents Radar Teach and Repeat (RT&R): a full-stack radar system for long-term off-road robot autonomy. RT&R can drive routes reliably in off-road cluttered areas without any GPS. We benchmark the radar system's closed-loop path-tracking performance and compare it to its 3D LiDAR counterpart. 11.8 km of autonomous driving was completed without interventions using only radar and gyro for navigation. RT&R was evaluated on different routes with progressively less structured scene geometry, achieving lateral path-tracking root mean squared errors (RMSE) of 5.6 cm, 7.5 cm, and 12.1 cm as the routes became more challenging. On the robot we used for testing, these RMSE values are less than half the width of one tire (24 cm). These same routes have worst-case errors of 21.7 cm, 24.0 cm, and 43.8 cm. We conclude that radar is a viable alternative to LiDAR for long-term autonomy in challenging off-road scenarios. The implementation of RT&R is open-source and available at: https://github.com/utiasASRL/vtr3.
   Submitted 16 September, 2024; originally announced September 2024.
   Comments: 7 pages, 5 figures, submitted to ICRA 2025

8. arXiv:2408.04145 [pdf, other]  (cs.CV)
   ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive Language-Image Pre-traning Model
   Authors: Yifan Chen, Xiaozhen Qiao, Zhe Sun, Xuelong Li
   Abstract: Contrastive Language-Image Pre-training (CLIP) models excel at integrating semantic information between images and text through contrastive learning techniques, and have achieved remarkable performance in various multimodal tasks. However, the deployment of large CLIP models is hindered in resource-limited environments, while smaller models frequently fail to meet the performance benchmarks required for practical applications. In this paper, we propose a novel approach, ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive Language-Image Pre-traning Model, which aims to comprehensively distill the knowledge from a large teacher CLIP model into a smaller student model, ensuring comparable performance with significantly reduced parameters. ComKD-CLIP is composed of two key mechanisms: Image Feature Alignment (IFAlign) and Educational Attention (EduAttention). IFAlign makes the image features extracted by the student model closely match those extracted by the teacher model, enabling the student to learn the teacher's knowledge of extracting image features. EduAttention explores the cross-relationships between text features extracted by the teacher model and image features extracted by the student model, enabling the student model to learn how the teacher model integrates text-image features. In addition, ComKD-CLIP can refine the knowledge distilled from IFAlign and EduAttention by leveraging the text-image feature fusion results of the teacher model, ensuring the student model accurately absorbs the teacher's knowledge. Extensive experiments conducted on 11 datasets have demonstrated the superiority of the proposed method.
   Submitted 20 August, 2024; v1 submitted 7 August, 2024; originally announced August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">update</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07594">arXiv:2406.07594</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07594">pdf</a>, <a href="https://arxiv.org/format/2406.07594">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> MLLMGuard: A Multi-dimensional Safety Evaluation Suite for Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+T">Tianle Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zeyang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+K">Kexin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+D">Dandan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yixu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Haiquan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuanqi Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xingge Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Keqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Teng%2C+Y">Yan Teng</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yingchun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07594v2-abstract-short" style="display: inline;"> Powered by remarkable advancements in Large Language Models (LLMs), Multimodal Large Language Models (MLLMs) demonstrate impressive capabilities in manifold tasks. However, the practical application scenarios of MLLMs are intricate, exposing them to potential malicious instructions and thereby posing safety risks. While current benchmarks do incorporate certain safety considerations, they often la&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07594v2-abstract-full').style.display = 'inline'; document.getElementById('2406.07594v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07594v2-abstract-full" style="display: none;"> Powered by remarkable advancements in Large Language Models (LLMs), Multimodal Large Language Models (MLLMs) demonstrate impressive capabilities in manifold tasks. However, the practical application scenarios of MLLMs are intricate, exposing them to potential malicious instructions and thereby posing safety risks. While current benchmarks do incorporate certain safety considerations, they often lack comprehensive coverage and fail to exhibit the necessary rigor and robustness. 
For instance, the common practice of employing GPT-4V as both the evaluator and a model to be evaluated lacks credibility, as it tends to exhibit a bias toward its own responses. In this paper, we present MLLMGuard, a multidimensional safety evaluation suite for MLLMs, including a bilingual image-text evaluation dataset, inference utilities, and a lightweight evaluator. MLLMGuard&#39;s assessment comprehensively covers two languages (English and Chinese) and five important safety dimensions (Privacy, Bias, Toxicity, Truthfulness, and Legality), each with corresponding rich subtasks. Focusing on these dimensions, our evaluation dataset is primarily sourced from platforms such as social media, and it integrates text-based and image-based red teaming techniques with meticulous annotation by human experts. This can prevent inaccurate evaluation caused by data leakage when using open-source datasets and ensures the quality and challenging nature of our benchmark. Additionally, a fully automated lightweight evaluator termed GuardRank is developed, which achieves significantly higher evaluation accuracy than GPT-4. Our evaluation results across 13 advanced models indicate that MLLMs still have a substantial journey ahead before they can be considered safe and responsible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07594v2-abstract-full').style.display = 'none'; document.getElementById('2406.07594v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16220">arXiv:2405.16220</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16220">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DAFFNet: A Dual Attention Feature Fusion Network for Classification of White Blood Cells </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuzhuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zetong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+Y">Yunuo An</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chenyang Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xu Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16220v1-abstract-short" style="display: inline;"> The precise categorization of white blood cell (WBC) is crucial for diagnosing blood-related disorders. However, manual analysis in clinical settings is time-consuming, labor-intensive, and prone to errors. Numerous studies have employed machine learning and deep learning techniques to achieve objective WBC classification, yet these studies have not fully utilized the information of WBC images. 
Th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16220v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16220v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16220v1-abstract-full" style="display: none;"> The precise categorization of white blood cell (WBC) is crucial for diagnosing blood-related disorders. However, manual analysis in clinical settings is time-consuming, labor-intensive, and prone to errors. Numerous studies have employed machine learning and deep learning techniques to achieve objective WBC classification, yet these studies have not fully utilized the information of WBC images. Therefore, our motivation is to comprehensively utilize the morphological information and high-level semantic information of WBC images to achieve accurate classification of WBC. In this study, we propose a novel dual-branch network Dual Attention Feature Fusion Network (DAFFNet), which for the first time integrates the high-level semantic features with morphological features of WBC to achieve accurate classification. Specifically, we introduce a dual attention mechanism, which enables the model to utilize the channel features and spatially localized features of the image more comprehensively. Morphological Feature Extractor (MFE), comprising Morphological Attributes Predictor (MAP) and Morphological Attributes Encoder (MAE), is proposed to extract the morphological features of WBC. We also implement Deep-supervised Learning (DSL) and Semi-supervised Learning (SSL) training strategies for MAE to enhance its performance. Our proposed network framework achieves 98.77%, 91.30%, 98.36%, 99.71%, 98.45%, and 98.85% overall accuracy on the six public datasets PBC, LISC, Raabin-WBC, BCCD, LDWBC, and Labelled, respectively, demonstrating superior effectiveness compared to existing studies. The results indicate that the WBC classification combining high-level semantic features and low-level morphological features is of great significance, which lays the foundation for objective and accurate classification of WBC in microscopic blood cell images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16220v1-abstract-full').style.display = 'none'; document.getElementById('2405.16220v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10357">arXiv:2405.10357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.10357">pdf</a>, <a href="https://arxiv.org/format/2405.10357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RGB Guided ToF Imaging System: A Survey of Deep Learning-based Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xin Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+P">Pengchao Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Hao Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+C">Chenyang Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10357v1-abstract-short" style="display: inline;"> Integrating an RGB camera into a ToF imaging system has become a significant technique for perceiving the real world. The RGB guided ToF imaging system is crucial to several applications, including face anti-spoofing, saliency detection, and trajectory prediction. Depending on the distance of the working range, the implementation schemes of the RGB guided ToF imaging systems are different. Specifi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10357v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10357v1-abstract-full" style="display: none;"> Integrating an RGB camera into a ToF imaging system has become a significant technique for perceiving the real world. The RGB guided ToF imaging system is crucial to several applications, including face anti-spoofing, saliency detection, and trajectory prediction. Depending on the distance of the working range, the implementation schemes of the RGB guided ToF imaging systems are different. Specifically, ToF sensors with a uniform field of illumination, which can output dense depth but have low resolution, are typically used for close-range measurements. In contrast, LiDARs, which emit laser pulses and can only capture sparse depth, are usually employed for long-range detection. In the two cases, depth quality improvement for RGB guided ToF imaging corresponds to two sub-tasks: guided depth super-resolution and guided depth completion. In light of the recent significant boost to the field provided by deep learning, this paper comprehensively reviews the works related to RGB guided ToF imaging, including network structures, learning strategies, evaluation metrics, benchmark datasets, and objective functions. Besides, we present quantitative comparisons of state-of-the-art methods on widely used benchmark datasets. Finally, we discuss future trends and the challenges in real applications for further research. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10357v1-abstract-full').style.display = 'none'; document.getElementById('2405.10357v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear on International Journal of Computer Vision (IJCV)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04393">arXiv:2405.04393</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.04393">pdf</a>, <a href="https://arxiv.org/format/2405.04393">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient Online Set-valued Classification with Bandit Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xingye Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04393v1-abstract-short" style="display: inline;"> Conformal prediction is a distribution-free method that wraps a given machine learning model and returns a set of plausible labels that contain the true label with a prescribed coverage rate. In practice, the empirical coverage achieved highly relies on fully observed label information from data both in the training phase for model fitting and the calibration phase for quantile estimation. This de&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04393v1-abstract-full').style.display = 'inline'; document.getElementById('2405.04393v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04393v1-abstract-full" style="display: none;"> Conformal prediction is a distribution-free method that wraps a given machine learning model and returns a set of plausible labels that contain the true label with a prescribed coverage rate. In practice, the empirical coverage achieved highly relies on fully observed label information from data both in the training phase for model fitting and the calibration phase for quantile estimation. This dependency poses a challenge in the context of online learning with bandit feedback, where a learner only has access to the correctness of actions (i.e., pulled an arm) but not the full information of the true label. In particular, when the pulled arm is incorrect, the learner only knows that the pulled one is not the true class label, but does not know which label is true. Additionally, bandit feedback further results in a smaller labeled dataset for calibration, limited to instances with correct actions, thereby affecting the accuracy of quantile estimation. 
To address these limitations, we propose Bandit Class-specific Conformal Prediction (BCCP), offering coverage guarantees on a class-specific granularity. Using an unbiased estimation of an estimand involving the true label, BCCP trains the model and makes set-valued inferences through stochastic gradient descent. Our approach overcomes the challenges of sparsely labeled data in each iteration and generalizes the reliability and applicability of conformal prediction to online decision-making environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04393v1-abstract-full').style.display = 'none'; document.getElementById('2405.04393v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02958">arXiv:2405.02958</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02958">pdf</a>, <a href="https://arxiv.org/format/2405.02958">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Score-based Generative Priors Guided Model-driven Network for MRI Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaoyu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Weisheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+B">Bin Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yuping Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lijian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02958v2-abstract-short" style="display: inline;"> Score matching with Langevin dynamics (SMLD) method has been successfully applied to accelerated MRI. However, the hyperparameters in the sampling process require subtle tuning, otherwise the results can be severely corrupted by hallucination artifacts, especially with out-of-distribution test data. To address the limitations, we proposed a novel workflow where naive SMLD samples serve as addition&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02958v2-abstract-full').style.display = 'inline'; document.getElementById('2405.02958v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02958v2-abstract-full" style="display: none;"> Score matching with Langevin dynamics (SMLD) method has been successfully applied to accelerated MRI. However, the hyperparameters in the sampling process require subtle tuning, otherwise the results can be severely corrupted by hallucination artifacts, especially with out-of-distribution test data. To address the limitations, we proposed a novel workflow where naive SMLD samples serve as additional priors to guide model-driven network training. 
First, we adopted a pretrained score network to generate samples as preliminary guidance images (PGI), obviating the need for network retraining, parameter tuning and in-distribution test data. Although PGIs are corrupted by hallucination artifacts, we believe they can provide extra information through effective denoising steps to facilitate reconstruction. Therefore, we designed a denoising module (DM) in the second step to coarsely eliminate artifacts and noises from PGIs. The features are extracted from a score-based information extractor (SIE) and a cross-domain information extractor (CIE), which directly map to the noise patterns. Third, we designed a model-driven network guided by denoised PGIs (DGIs) to further recover fine details. DGIs are densely connected with intermediate reconstructions in each cascade to enrich the information and are periodically updated to provide more accurate guidance. Our experiments on different datasets reveal that despite the low average quality of PGIs, the proposed workflow can effectively extract valuable information to guide the network training, even with severely reduced training data and sampling steps. Our method outperforms other cutting-edge techniques by effectively mitigating hallucination artifacts, yielding robust and high-quality reconstruction results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02958v2-abstract-full').style.display = 'none'; document.getElementById('2405.02958v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13166">arXiv:2404.13166</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.13166">pdf</a>, <a href="https://arxiv.org/format/2404.13166">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> FoMo: A Proposal for a Multi-Season Dataset for Robot Navigation in Forêt Montmorency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Boxan%2C+M">Matěj Boxan</a>, <a href="/search/cs?searchtype=author&amp;query=Krawciw%2C+A">Alexander Krawciw</a>, <a href="/search/cs?searchtype=author&amp;query=Daum%2C+E">Effie Daum</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinyuan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Lilge%2C+S">Sven Lilge</a>, <a href="/search/cs?searchtype=author&amp;query=Barfoot%2C+T+D">Timothy D. Barfoot</a>, <a href="/search/cs?searchtype=author&amp;query=Pomerleau%2C+F">François Pomerleau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13166v1-abstract-short" style="display: inline;"> In this paper, we propose the FoMo (Forêt Montmorency) dataset: a comprehensive, multi-season data collection. 
Located in the Montmorency Forest, Quebec, Canada, our dataset will capture a rich variety of sensory data over six distinct trajectories totaling 6 kilometers, repeated through different seasons to accumulate 42 kilometers of recorded data. The boreal forest environment increases the div&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13166v1-abstract-full').style.display = 'inline'; document.getElementById('2404.13166v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13166v1-abstract-full" style="display: none;"> In this paper, we propose the FoMo (Forêt Montmorency) dataset: a comprehensive, multi-season data collection. Located in the Montmorency Forest, Quebec, Canada, our dataset will capture a rich variety of sensory data over six distinct trajectories totaling 6 kilometers, repeated through different seasons to accumulate 42 kilometers of recorded data. The boreal forest environment increases the diversity of datasets for mobile robot navigation. This proposed dataset will feature a broad array of sensor modalities, including lidar, radar, and a navigation-grade Inertial Measurement Unit (IMU), against the backdrop of challenging boreal forest conditions. Notably, the FoMo dataset will be distinguished by its inclusion of seasonal variations, such as changes in tree canopy and snow depth up to 2 meters, presenting new challenges for robot navigation algorithms. Alongside, we will offer a centimeter-level accurate ground truth, obtained through Post Processed Kinematic (PPK) Global Navigation Satellite System (GNSS) correction, facilitating precise evaluation of odometry and localization algorithms. This work aims to spur advancements in autonomous navigation, enabling the development of robust algorithms capable of handling the dynamic, unstructured environments characteristic of boreal forests. With a public odometry and localization leaderboard and a dedicated software suite, we invite the robotics community to engage with the FoMo dataset by exploring new frontiers in robot navigation under extreme environmental variations. We seek feedback from the community based on this proposal to make the dataset as useful as possible. For further details and supplementary materials, please visit https://norlab-ulaval.github.io/FoMo-website/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13166v1-abstract-full').style.display = 'none'; document.getElementById('2404.13166v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the IEEE ICRA Workshop on Field Robotics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12693">arXiv:2404.12693</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.12693">pdf</a>, <a href="https://arxiv.org/format/2404.12693">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Chinese Character Representation with Formation Tree </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hong%2C+Y">Yang Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yinfei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaojun Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junsong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12693v1-abstract-short" style="display: inline;"> Learning effective representations for Chinese characters presents unique challenges, primarily due to the vast number of characters and their continuous growth, which requires models to handle an expanding category space. Additionally, the inherent sparsity of character usage complicates the generalization of learned representations. Prior research has explored radical-based sequences to overcome&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12693v1-abstract-full').style.display = 'inline'; document.getElementById('2404.12693v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12693v1-abstract-full" style="display: none;"> Learning effective representations for Chinese characters presents unique challenges, primarily due to the vast number of characters and their continuous growth, which requires models to handle an expanding category space. Additionally, the inherent sparsity of character usage complicates the generalization of learned representations. Prior research has explored radical-based sequences to overcome these issues, achieving progress in recognizing unseen characters. However, these approaches fail to fully exploit the inherent tree structure of such sequences. To address these limitations and leverage established data properties, we propose Formation Tree-CLIP (FT-CLIP). This model utilizes formation trees to represent characters and incorporates a dedicated tree encoder, significantly improving performance in both seen and unseen character recognition tasks. We further introduce masking for to both character images and tree nodes, enabling efficient and effective training. This approach accelerates training significantly (by a factor of 2 or more) while enhancing accuracy. 
Extensive experiments show that processing characters through formation trees aligns better with their inherent properties than direct sequential methods, significantly enhancing the generality and usability of the representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12693v1-abstract-full').style.display = 'none'; document.getElementById('2404.12693v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09640">arXiv:2404.09640</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.09640">pdf</a>, <a href="https://arxiv.org/format/2404.09640">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CREST: Cross-modal Resonance through Evidential Deep Learning for Enhanced Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Haojian Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaozhen Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haodong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bingyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhe Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mulin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09640v4-abstract-short" style="display: inline;"> Zero-shot learning (ZSL) enables the recognition of novel classes by leveraging semantic knowledge transfer from known to unknown categories. This knowledge, typically encapsulated in attribute descriptions, aids in identifying class-specific visual features, thus facilitating visual-semantic alignment and improving ZSL performance. However, real-world challenges such as distribution imbalances an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09640v4-abstract-full').style.display = 'inline'; document.getElementById('2404.09640v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09640v4-abstract-full" style="display: none;"> Zero-shot learning (ZSL) enables the recognition of novel classes by leveraging semantic knowledge transfer from known to unknown categories. This knowledge, typically encapsulated in attribute descriptions, aids in identifying class-specific visual features, thus facilitating visual-semantic alignment and improving ZSL performance. However, real-world challenges such as distribution imbalances and attribute co-occurrence among instances often hinder the discernment of local variances in images, a problem exacerbated by the scarcity of fine-grained, region-specific attribute annotations. 
Moreover, the variability in visual presentation within categories can also skew attribute-category associations. In response, we propose a bidirectional cross-modal ZSL approach CREST. It begins by extracting representations for attribute and visual localization and employs Evidential Deep Learning (EDL) to measure underlying epistemic uncertainty, thereby enhancing the model&#39;s resilience against hard negatives. CREST incorporates dual learning pathways, focusing on both visual-category and attribute-category alignments, to ensure robust correlation between latent and observable spaces. Moreover, we introduce an uncertainty-informed cross-modal fusion technique to refine visual-attribute inference. Extensive experiments demonstrate our model&#39;s effectiveness and unique explainability across multiple datasets. Our code and data are available at: https://github.com/JethroJames/CREST <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09640v4-abstract-full').style.display = 'none'; document.getElementById('2404.09640v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM 2024; 10 pages, 2 Tables, 9 Figures; Repo is available at: https://github.com/JethroJames/CREST</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01712">arXiv:2404.01712</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.01712">pdf</a>, <a href="https://arxiv.org/format/2404.01712">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hessian-Free Online Certified Unlearning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinbao Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Ming Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+E">Ermin Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01712v4-abstract-short" style="display: inline;"> Machine unlearning strives to uphold the data owners&#39; right to be forgotten by enabling models to selectively forget specific data. Recent advances suggest pre-computing and storing statistics extracted from second-order information and implementing unlearning through Newton-style updates. 
However, the Hessian matrix operations are extremely costly and previous works conduct unlearning for empiric&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01712v4-abstract-full').style.display = 'inline'; document.getElementById('2404.01712v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01712v4-abstract-full" style="display: none;"> Machine unlearning strives to uphold the data owners&#39; right to be forgotten by enabling models to selectively forget specific data. Recent advances suggest pre-computing and storing statistics extracted from second-order information and implementing unlearning through Newton-style updates. However, the Hessian matrix operations are extremely costly and previous works conduct unlearning for empirical risk minimizer with the convexity assumption, precluding their applicability to high-dimensional over-parameterized models and the nonconvergence condition. In this paper, we propose an efficient Hessian-free unlearning approach. The key idea is to maintain a statistical vector for each training data, computed through affine stochastic recursion of the difference between the retrained and learned models. We prove that our proposed method outperforms the state-of-the-art methods in terms of the unlearning and generalization guarantees, the deletion capacity, and the time/storage complexity, under the same regularity conditions. Through the strategy of recollecting statistics for removing data, we develop an online unlearning algorithm that achieves near-instantaneous data removal, as it requires only vector addition. Experiments demonstrate that our proposed scheme surpasses existing results by orders of magnitude in terms of time/storage costs with millisecond-level unlearning execution, while also enhancing test accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01712v4-abstract-full').style.display = 'none'; document.getElementById('2404.01712v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15032">arXiv:2403.15032</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15032">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Integrated Neighborhood and Scale Information Network for Open-Pit Mine Change Detection in High-Resolution Remote Sensing Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Z">Zilin Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kangning Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jinbao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinzhong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaojun Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+D">Deshuai Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+C">Cheng Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15032v1-abstract-short" style="display: inline;"> Open-pit mine change detection (CD) in high-resolution (HR) remote sensing images plays a crucial role in mineral development and environmental protection. Significant progress has been made in this field in recent years, largely due to the advancement of deep learning techniques. However, existing deep-learning-based CD methods encounter challenges in effectively integrating neighborhood and scal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15032v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15032v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15032v1-abstract-full" style="display: none;"> Open-pit mine change detection (CD) in high-resolution (HR) remote sensing images plays a crucial role in mineral development and environmental protection. Significant progress has been made in this field in recent years, largely due to the advancement of deep learning techniques. However, existing deep-learning-based CD methods encounter challenges in effectively integrating neighborhood and scale information, resulting in suboptimal performance. Therefore, by exploring the influence patterns of neighborhood and scale information, this paper proposes an Integrated Neighborhood and Scale Information Network (INSINet) for open-pit mine CD in HR remote sensing images. Specifically, INSINet introduces 8-neighborhood-image information to acquire a larger receptive field, improving the recognition of center image boundary regions. Drawing on techniques of skip connection, deep supervision, and attention mechanism, the multi-path deep supervised attention (MDSA) module is designed to enhance multi-scale information fusion and change feature extraction. 
Experimental analysis reveals that incorporating neighborhood and scale information enhances the F1 score of INSINet by 6.40%, with improvements of 3.08% and 3.32% respectively. INSINet outperforms existing methods with an Overall Accuracy of 97.69%, Intersection over Union of 71.26%, and F1 score of 83.22%. INSINet shows significance for open-pit mine CD in HR remote sensing images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15032v1-abstract-full').style.display = 'none'; document.getElementById('2403.15032v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.03383">arXiv:2402.03383</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.03383">pdf</a>, <a href="https://arxiv.org/format/2402.03383">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MCU-Net: A Multi-prior Collaborative Deep Unfolding Network with Gates-controlled Spatial Attention for Accelerated MR Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaoyu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Weisheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guofen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yuping Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.03383v3-abstract-short" style="display: inline;"> Deep unfolding networks (DUNs) have demonstrated significant potential in accelerating magnetic resonance imaging (MRI). However, they often encounter high computational costs and slow convergence rates. Besides, they struggle to fully exploit the complementarity when incorporating multiple priors. In this study, we propose a multi-prior collaborative DUN, termed MCU-Net, to address these limitati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03383v3-abstract-full').style.display = 'inline'; document.getElementById('2402.03383v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.03383v3-abstract-full" style="display: none;"> Deep unfolding networks (DUNs) have demonstrated significant potential in accelerating magnetic resonance imaging (MRI). However, they often encounter high computational costs and slow convergence rates. Besides, they struggle to fully exploit the complementarity when incorporating multiple priors. In this study, we propose a multi-prior collaborative DUN, termed MCU-Net, to address these limitations. Our method features a parallel structure consisting of different optimization-inspired subnetworks based on low-rank and sparsity, respectively. 
We design a gates-controlled spatial attention module (GSAM), evaluating the relative confidence (RC) and overall confidence (OC) maps for intermediate reconstructions produced by different subnetworks. RC allocates greater weights to the image regions where each subnetwork excels, enabling precise element-wise collaboration. We design correction modules to enhance the effectiveness in regions where both subnetworks exhibit limited performance, as indicated by low OC values, thereby obviating the need for additional branches. The gate units within GSAMs are designed to preserve necessary information across multiple iterations, improving the accuracy of the learned confidence maps and enhancing robustness against accumulated errors. Experimental results on multiple datasets show significant improvements on PSNR and SSIM results with relatively low FLOPs compared to cutting-edge methods. Additionally, the proposed strategy can be conveniently applied to various DUN structures to enhance their performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03383v3-abstract-full').style.display = 'none'; document.getElementById('2402.03383v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.17721">arXiv:2401.17721</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.17721">pdf</a>, <a href="https://arxiv.org/format/2401.17721">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Time Synchronization for 5G and TSN Integrated Networking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zonghui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xuan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yiming Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ai%2C+B">Bo Ai</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiaoyu Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.17721v1-abstract-short" style="display: inline;"> Emerging industrial applications involving robotic collaborative operations and mobile robots require a more reliable and precise wireless network for deterministic data transmission. To meet this demand, the 3rd Generation Partnership Project (3GPP) is promoting the integration of 5th Generation Mobile Communication Technology (5G) and Time-Sensitive Networking (TSN). 
Time synchronization is esse&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.17721v1-abstract-full').style.display = 'inline'; document.getElementById('2401.17721v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.17721v1-abstract-full" style="display: none;"> Emerging industrial applications involving robotic collaborative operations and mobile robots require a more reliable and precise wireless network for deterministic data transmission. To meet this demand, the 3rd Generation Partnership Project (3GPP) is promoting the integration of 5th Generation Mobile Communication Technology (5G) and Time-Sensitive Networking (TSN). Time synchronization is essential for deterministic data transmission. Based on the 3GPP&#39;s vision of the 5G and TSN integrated networking with interoperability, we improve the time synchronization of TSN to conquer the multi-gNB competition, re-transmission, and mobility problems for the integrated 5G time synchronization. We implemented the improvement mechanisms and systematically validated the performance of 5G+TSN time synchronization. Based on the simulation in 500m x 500m industrial environments, the improved time synchronization achieved a precision of 1 microsecond with interoperability between 5G nodes and TSN nodes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.17721v1-abstract-full').style.display = 'none'; document.getElementById('2401.17721v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10475">arXiv:2401.10475</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.10475">pdf</a>, <a href="https://arxiv.org/format/2401.10475">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short Video Search Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiangshuo Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xianxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+X">Xiaozhe Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Cihang Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10475v2-abstract-short" style="display: inline;"> Vision-Language Models pre-trained on large-scale image-text datasets have shown superior performance in downstream tasks such as image retrieval. Most of the images for pre-training are presented in the form of open domain common-sense visual elements. Differently, video covers in short video search scenarios are presented as user-originated contents that provide important visual summaries of vid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10475v2-abstract-full').style.display = 'inline'; document.getElementById('2401.10475v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.10475v2-abstract-full" style="display: none;"> Vision-Language Models pre-trained on large-scale image-text datasets have shown superior performance in downstream tasks such as image retrieval. Most of the images for pre-training are presented in the form of open domain common-sense visual elements. Differently, video covers in short video search scenarios are presented as user-originated contents that provide important visual summaries of videos. In addition, a portion of the video covers come with manually designed cover texts that provide semantic complements. In order to fill in the gaps in short video cover data, we establish the first large-scale cover-text benchmark for Chinese short video search scenarios. Specifically, we release two large-scale datasets CBVS-5M/10M to provide short video covers, and the manual fine-labeling dataset CBVS-20K to provide real user queries, which serves as an image-text benchmark test in the Chinese short video search field. To integrate the semantics of cover text in the case of modality missing, we propose UniCLIP where cover texts play a guiding role during training, however are not relied upon by inference. Extensive evaluation on CBVS-20K demonstrates the excellent performance of our proposal. 
UniCLIP has been deployed to Tencent&#39;s online video search systems with hundreds of millions of visits and achieved significant gains. The dataset and code are available at https://github.com/QQBrowserVideoSearch/CBVS-UniCLIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10475v2-abstract-full').style.display = 'none'; document.getElementById('2401.10475v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.09133">arXiv:2401.09133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.09133">pdf</a>, <a href="https://arxiv.org/format/2401.09133">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SM$^3$: Self-Supervised Multi-task Modeling with Multi-view 2D Images for Articulated Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haowen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Zhao Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+Z">Zhengping Che</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+L">Liang Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yakun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Z">Zhipeng Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiuquan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.09133v1-abstract-short" style="display: inline;"> Reconstructing real-world objects and estimating their movable joint structures are pivotal technologies within the field of robotics. Previous research has predominantly focused on supervised approaches, relying on extensively annotated datasets to model articulated objects within limited categories. However, this approach falls short of effectively addressing the diversity present in the real wo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09133v1-abstract-full').style.display = 'inline'; document.getElementById('2401.09133v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.09133v1-abstract-full" style="display: none;"> Reconstructing real-world objects and estimating their movable joint structures are pivotal technologies within the field of robotics. Previous research has predominantly focused on supervised approaches, relying on extensively annotated datasets to model articulated objects within limited categories. 
However, this approach falls short of effectively addressing the diversity present in the real world. To tackle this issue, we propose a self-supervised interaction perception method, referred to as SM$^3$, which leverages multi-view RGB images captured before and after interaction to model articulated objects, identify the movable parts, and infer the parameters of their rotating joints. By constructing 3D geometries and textures from the captured 2D images, SM$^3$ achieves integrated optimization of movable part and joint parameters during the reconstruction process, obviating the need for annotations. Furthermore, we introduce the MMArt dataset, an extension of PartNet-Mobility, encompassing multi-view and multi-modal data of articulated objects spanning diverse categories. Evaluations demonstrate that SM$^3$ surpasses existing benchmarks across various categories and objects, while its adaptability in real-world scenarios has been thoroughly validated. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09133v1-abstract-full').style.display = 'none'; document.getElementById('2401.09133v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.05689">arXiv:2401.05689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.05689">pdf</a>, <a href="https://arxiv.org/format/2401.05689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP49357.2023.10096194">10.1109/ICASSP49357.2023.10096194 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> UCorrect: An Unsupervised Framework for Automatic Speech Recognition Error Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jiaxin Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Minghan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaosong Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+D">Daimeng Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+H">Hengchao Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zongyao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhengzhe Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yinglu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+C">Chang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+S">Shimin Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a> </p> <p 
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.05689v1-abstract-short" style="display: inline;"> Error correction techniques have been used to refine the output sentences from automatic speech recognition (ASR) models and achieve a lower word error rate (WER). Previous works usually adopt end-to-end models and has strong dependency on Pseudo Paired Data and Original Paired Data. But when only pre-training on Pseudo Paired Data, previous models have negative effect on correction. While fine-tu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05689v1-abstract-full').style.display = 'inline'; document.getElementById('2401.05689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.05689v1-abstract-full" style="display: none;"> Error correction techniques have been used to refine the output sentences from automatic speech recognition (ASR) models and achieve a lower word error rate (WER). Previous works usually adopt end-to-end models and has strong dependency on Pseudo Paired Data and Original Paired Data. But when only pre-training on Pseudo Paired Data, previous models have negative effect on correction. While fine-tuning on Original Paired Data, the source side data must be transcribed by a well-trained ASR model, which takes a lot of time and not universal. In this paper, we propose UCorrect, an unsupervised Detector-Generator-Selector framework for ASR Error Correction. UCorrect has no dependency on the training data mentioned before. The whole procedure is first to detect whether the character is erroneous, then to generate some candidate characters and finally to select the most confident one to replace the error character. Experiments on the public AISHELL-1 dataset and WenetSpeech dataset show the effectiveness of UCorrect for ASR error correction: 1) it achieves significant WER reduction, achieves 6.83\% even without fine-tuning and 14.29\% after fine-tuning; 2) it outperforms the popular NAR correction models by a large margin with a competitive low latency; and 3) it is an universal method, as it reduces all WERs of the ASR model with different decoding strategies and reduces all WERs of ASR models trained on different scale datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05689v1-abstract-full').style.display = 'none'; document.getElementById('2401.05689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.03329">arXiv:2401.03329</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.03329">pdf</a>, <a href="https://arxiv.org/format/2401.03329">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-030-90525-5_38">10.1007/978-3-030-90525-5_38 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Designing a Socially Assistive Robot to Support Older Adults with Low Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+E">Emily Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zhonghao Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaoyang Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Matari%C4%87%2C+M+J">Maja J Matari膰</a>, <a href="/search/cs?searchtype=author&amp;query=Bittner%2C+A+K">Ava K Bittner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.03329v1-abstract-short" style="display: inline;"> Socially assistive robots (SARs) have shown great promise in supplementing and augmenting interventions to support the physical and mental well-being of older adults. However, past work has not yet explored the potential of applying SAR to lower the barriers of long-term low vision rehabilitation (LVR) interventions for older adults. In this work, we present a user-informed design process to valid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.03329v1-abstract-full').style.display = 'inline'; document.getElementById('2401.03329v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.03329v1-abstract-full" style="display: none;"> Socially assistive robots (SARs) have shown great promise in supplementing and augmenting interventions to support the physical and mental well-being of older adults. However, past work has not yet explored the potential of applying SAR to lower the barriers of long-term low vision rehabilitation (LVR) interventions for older adults. In this work, we present a user-informed design process to validate the motivation and identify major design principles for developing SAR for long-term LVR. To evaluate user-perceived usefulness and acceptance of SAR in this novel domain, we performed a two-phase study through user surveys. First, a group (n=38) of older adults with LV completed a mailed-in survey. Next, a new group (n=13) of older adults with LV saw an in-clinic SAR demo and then completed the survey. 
The study participants reported that SARs would be useful, trustworthy, easy to use, and enjoyable while providing socio-emotional support to augment LVR interventions. The in-clinic demo group reported significantly more positive opinions of the SAR&#39;s capabilities than did the baseline survey group that used mailed-in forms without the SAR demo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.03329v1-abstract-full').style.display = 'none'; document.getElementById('2401.03329v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in Social Robotics: 13th International Conference, ICSR 2021. Springer International Publishing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.00982">arXiv:2311.00982</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.00982">pdf</a>, <a href="https://arxiv.org/ps/2311.00982">ps</a>, <a href="https://arxiv.org/format/2311.00982">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> The c-differential properties of a class of power functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Huan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xiaoni Du</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Wenping Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xingbin Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.00982v1-abstract-short" style="display: inline;"> Power functions with low $c$-differential uniformity have been widely studied not only because of their strong resistance to multiplicative differential attacks, but also low implementation cost in hardware. Furthermore, the $c$-differential spectrum of a function gives a more precise characterization of its $c$-differential properties. Let $f(x)=x^{\frac{p^n+3}{2}}$ be a power function over the f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00982v1-abstract-full').style.display = 'inline'; document.getElementById('2311.00982v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.00982v1-abstract-full" style="display: none;"> Power functions with low $c$-differential uniformity have been widely studied not only because of their strong resistance to multiplicative differential attacks, but also low implementation cost in hardware. Furthermore, the $c$-differential spectrum of a function gives a more precise characterization of its $c$-differential properties. Let $f(x)=x^{\frac{p^n+3}{2}}$ be a power function over the finite field $\mathbb{F}_{p^{n}}$, where $p\neq3$ is an odd prime and $n$ is a positive integer. 
In this paper, for all primes $p\neq3$, by investigating certain character sums with regard to elliptic curves and computing the number of solutions of a system of equations over $\mathbb{F}_{p^{n}}$, we determine explicitly the $(-1)$-differential spectrum of $f$ with a unified approach. We show that if $p^n \equiv 3 \pmod 4$, then $f$ is a differentially $(-1,3)$-uniform function except for $p^n\in\{7,19,23\}$ where $f$ is an APcN function, and if $p^n \equiv 1 \pmod 4$, the $(-1)$-differential uniformity of $f$ is equal to $4$. In addition, an upper bound of the $c$-differential uniformity of $f$ is also given. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00982v1-abstract-full').style.display = 'none'; document.getElementById('2311.00982v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07159">arXiv:2310.07159</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.07159">pdf</a>, <a href="https://arxiv.org/format/2310.07159">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> My Brother Helps Me: Node Injection Based Adversarial Attack on Social Bot Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lanjun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinran Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yanwei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+W">Weizhi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongdong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Anan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07159v1-abstract-short" style="display: inline;"> Social platforms such as Twitter are under siege from a multitude of fraudulent users. In response, social bot detection tasks have been developed to identify such fake users. Due to the structure of social networks, the majority of methods are based on the graph neural network(GNN), which is susceptible to attacks. In this study, we propose a node injection-based adversarial attack method designe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07159v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07159v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07159v1-abstract-full" style="display: none;"> Social platforms such as Twitter are under siege from a multitude of fraudulent users. In response, social bot detection tasks have been developed to identify such fake users. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07159">arXiv:2310.07159</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.07159">pdf</a>, <a href="https://arxiv.org/format/2310.07159">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> My Brother Helps Me: Node Injection Based Adversarial Attack on Social Bot Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lanjun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinran Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yanwei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+W">Weizhi Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongdong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Anan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2310.07159v1-abstract-full" style="display: inline;"> Social platforms such as Twitter are under siege from a multitude of fraudulent users. In response, social bot detection tasks have been developed to identify such fake users. Due to the structure of social networks, the majority of methods are based on the graph neural network (GNN), which is susceptible to attacks. In this study, we propose a node injection-based adversarial attack method designed to deceive bot detection models. Notably, neither the target bot nor the newly injected bot can be detected when a new bot is added around the target bot. This attack operates in a black-box fashion, implying that any information related to the victim model remains unknown. To our knowledge, this is the first study exploring the resilience of bot detection through graph node injection. Furthermore, we develop an attribute recovery module to revert the injected node embedding from the graph embedding space back to the original feature space, enabling the adversary to manipulate node perturbation effectively. We conduct adversarial attacks on four commonly used GNN structures for bot detection on two widely used datasets: Cresci-2015 and TwiBot-22. The attack success rate is over 73\% and the rate of newly injected nodes being detected as bots is below 13\% on these two datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p>
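<p class="is-size-7">The abstract only sketches the attack at a high level. Purely as an illustration of the general node-injection idea (not the authors' black-box method or their attribute recovery module), the toy script below attaches one new node to a target node in a small graph and searches for injected features that lower the "bot" score a fixed surrogate one-layer GCN assigns to both the target and the new node; the graph, the surrogate model, and the random-search loop are assumptions made for this sketch.</p>
<pre><code class="language-python">import numpy as np

rng = np.random.default_rng(0)

def gcn_scores(A, X, W):
    """One-layer GCN surrogate: symmetric-normalised propagation + linear readout."""
    A_hat = A + np.eye(A.shape[0])                 # add self-loops
    d = A_hat.sum(1)
    A_norm = A_hat / np.sqrt(np.outer(d, d))       # D^-1/2 (A+I) D^-1/2
    return A_norm @ X @ W                          # per-node "bot" logit

# Toy graph: 6 accounts, 4 features; node 0 plays the role of the target bot.
A = np.array([[0,1,1,0,0,0],
              [1,0,1,0,0,0],
              [1,1,0,1,0,0],
              [0,0,1,0,1,1],
              [0,0,0,1,0,1],
              [0,0,0,1,1,0]], float)
X = rng.normal(size=(6, 4))
W = rng.normal(size=(4, 1))                        # frozen surrogate detector

def inject(A, X, feat):
    """Attach one new node, connected to the target (node 0), with features `feat`."""
    n = A.shape[0]
    A2 = np.zeros((n + 1, n + 1)); A2[:n, :n] = A
    A2[0, n] = A2[n, 0] = 1.0
    return A2, np.vstack([X, feat])

# Black-box-style random search: pick injected features that push the target's
# bot logit down while keeping the injected node's own logit low as well.
best_feat, best_score = None, np.inf
for _ in range(500):
    feat = rng.normal(size=(1, 4))
    A2, X2 = inject(A, X, feat)
    s = gcn_scores(A2, X2, W)
    score = s[0, 0] + s[-1, 0]                     # target logit + injected-node logit
    if score < best_score:
        best_feat, best_score = feat, score

print("target logit before injection:", float(gcn_scores(A, X, W)[0, 0]))
print("best combined logit after injection:", float(best_score))
print("injected features:", best_feat.round(2))
</code></pre>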
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.09552">arXiv:2309.09552</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.09552">pdf</a>, <a href="https://arxiv.org/format/2309.09552">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Multitask Training Approach to Enhance Whisper with Contextual Biasing and Open-Vocabulary Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+C">Chang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yinglu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaosong Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengxin Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+M">Miaomiao Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+D">Daimeng Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+S">Shimin Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2309.09552v4-abstract-full" style="display: inline;"> The recognition of rare named entities, such as personal names and terminologies, is challenging for automatic speech recognition (ASR) systems, especially when they are not frequently observed in the training data. In this paper, we introduce keyword spotting enhanced Whisper (KWS-Whisper), a novel ASR system that leverages the Whisper model and performs open-vocabulary keyword spotting (OV-KWS) on the hidden states of the Whisper encoder to recognize user-defined named entities. These entities serve as prompts for the Whisper decoder. To optimize the model, we propose a multitask training approach that learns OV-KWS and contextual-ASR tasks. We evaluate our approach on Chinese Aishell hot word subsets and two internal code-switching test sets and show that it significantly improves the entity recall compared to the original Whisper model. Moreover, we demonstrate that the OV-KWS can be a plug-and-play module to enhance the ASR error correction methods and frozen Whisper models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, Accepted to InterSpeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.13024">arXiv:2308.13024</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.13024">pdf</a>, <a href="https://arxiv.org/format/2308.13024">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> EVM: Incorporating Model Checking into Exploratory Visual Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kale%2C+A">Alex Kale</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Ziyang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X+L">Xiao Li Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Heer%2C+J">Jeffrey Heer</a>, <a href="/search/cs?searchtype=author&amp;query=Hullman%2C+J">Jessica Hullman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.13024v1-abstract-short" style="display: inline;"> Visual analytics (VA) tools support data exploration by helping analysts quickly and iteratively generate views of data which reveal interesting patterns. However, these tools seldom enable explicit checks of the resulting interpretations of data -- e.g., whether patterns can be accounted for by a model that implies a particular structure in the relationships between variables. We present EVM, a d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13024v1-abstract-full').style.display = 'inline'; document.getElementById('2308.13024v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.13024v1-abstract-full" style="display: none;"> Visual analytics (VA) tools support data exploration by helping analysts quickly and iteratively generate views of data which reveal interesting patterns. However, these tools seldom enable explicit checks of the resulting interpretations of data -- e.g., whether patterns can be accounted for by a model that implies a particular structure in the relationships between variables. We present EVM, a data exploration tool that enables users to express and check provisional interpretations of data in the form of statistical models. EVM integrates support for visualization-based model checks by rendering distributions of model predictions alongside user-generated views of data. In a user study with data scientists practicing in the private and public sector, we evaluate how model checks influence analysts&#39; thinking during data exploration. Our analysis characterizes how participants use model checks to scrutinize expectations about data generating process and surfaces further opportunities to scaffold model exploration in VA tools. 
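<p class="is-size-7">EVM itself is an interactive visual analytics tool; the snippet below only illustrates the underlying idea of a visualization-based model check: fit a provisional model (here a plain linear regression), simulate datasets from it, and plot them next to the observed data so that systematic discrepancies become visible. The data, the model, and the plotting choices are illustrative assumptions, not taken from the paper.</p>
<pre><code class="language-python">import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)

# Observed data with a mild nonlinearity an analyst might have spotted in a scatterplot.
x = rng.uniform(0, 10, 200)
y = 1.5 * x + 0.08 * x**2 + rng.normal(0, 1.5, x.size)

# Provisional interpretation to check: "the trend is linear", i.e. y ~ a + b*x.
b, a = np.polyfit(x, y, 1)
sigma = np.std(y - (a + b * x))

# Visual model check: draw several datasets from the fitted model and plot them
# alongside the observed data; systematic departures suggest the model is inadequate.
fig, axes = plt.subplots(1, 4, figsize=(12, 3), sharey=True)
axes[0].scatter(x, y, s=8)
axes[0].set_title("observed")
for ax in axes[1:]:
    y_sim = a + b * x + rng.normal(0, sigma, x.size)
    ax.scatter(x, y_sim, s=8)
    ax.set_title("simulated from model")
plt.tight_layout()
plt.show()
</code></pre>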
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.10822">arXiv:2308.10822</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.10822">pdf</a>, <a href="https://arxiv.org/format/2308.10822">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Novel Enhanced Move Recognition Algorithm Based on Pre-trained Models with Positional Embeddings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wen%2C+H">Hao Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaodong Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2308.10822v2-abstract-full" style="display: inline;"> The recognition of abstracts is crucial for effectively locating the content and clarifying the article. Existing move recognition algorithms lack the ability to learn word position information to obtain contextual semantics. This paper proposes a novel enhanced move recognition algorithm with an improved pre-trained model and a gated network with an attention mechanism for unstructured abstracts of Chinese scientific and technological papers. The proposed algorithm first performs summary data segmentation and vocabulary training. The EP-ERNIE$\_$AT-GRU framework is leveraged to incorporate word positional information, facilitating deep semantic learning and targeted feature extraction. Experimental results demonstrate that the proposed algorithm achieves 13.37$\%$ higher accuracy on the split dataset than on the original dataset and a 7.55$\%$ improvement in accuracy over the basic comparison model.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.10822v2-abstract-full').style.display = 'none'; document.getElementById('2308.10822v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02239">arXiv:2308.02239</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.02239">pdf</a>, <a href="https://arxiv.org/format/2308.02239">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3581783.3612142">10.1145/3581783.3612142 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DTF-Net: Category-Level Pose Estimation and Shape Reconstruction via Deformable Template Field </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haowen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Z">Zhipeng Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+Z">Zhengping Che</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Feifei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yakun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiuquan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.02239v1-abstract-short" style="display: inline;"> Estimating 6D poses and reconstructing 3D shapes of objects in open-world scenes from RGB-depth image pairs is challenging. Many existing methods rely on learning geometric features that correspond to specific templates while disregarding shape variations and pose differences among objects in the same category. 
As a result, these methods underperform when handling unseen object instances in comple&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02239v1-abstract-full').style.display = 'inline'; document.getElementById('2308.02239v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.02239v1-abstract-full" style="display: none;"> Estimating 6D poses and reconstructing 3D shapes of objects in open-world scenes from RGB-depth image pairs is challenging. Many existing methods rely on learning geometric features that correspond to specific templates while disregarding shape variations and pose differences among objects in the same category. As a result, these methods underperform when handling unseen object instances in complex environments. In contrast, other approaches aim to achieve category-level estimation and reconstruction by leveraging normalized geometric structure priors, but the static prior-based reconstruction struggles with substantial intra-class variations. To solve these problems, we propose the DTF-Net, a novel framework for pose estimation and shape reconstruction based on implicit neural fields of object categories. In DTF-Net, we design a deformable template field to represent the general category-wise shape latent features and intra-category geometric deformation features. The field establishes continuous shape correspondences, deforming the category template into arbitrary observed instances to accomplish shape reconstruction. We introduce a pose regression module that shares the deformation features and template codes from the fields to estimate the accurate 6D pose of each object in the scene. We integrate a multi-modal representation extraction module to extract object features and semantic masks, enabling end-to-end inference. Moreover, during training, we implement a shape-invariant training strategy and a viewpoint sampling method to further enhance the model&#39;s capability to extract object pose features. Extensive experiments on the REAL275 and CAMERA25 datasets demonstrate the superiority of DTF-Net in both synthetic and real scenes. Furthermore, we show that DTF-Net effectively supports grasping tasks with a real robot arm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02239v1-abstract-full').style.display = 'none'; document.getElementById('2308.02239v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors are with equal contributions. 
Paper accepted by ACM MM 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.16848">arXiv:2307.16848</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.16848">pdf</a>, <a href="https://arxiv.org/format/2307.16848">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-aware Gaussian Mixture Model for UWB Time Difference of Arrival Localization in Cluttered Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenda Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Goudar%2C+A">Abhishek Goudar</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+M">Mingliang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinyuan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Schoellig%2C+A+P">Angela P. Schoellig</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.16848v1-abstract-short" style="display: inline;"> Ultra-wideband (UWB) time difference of arrival(TDOA)-based localization has emerged as a low-cost and scalable indoor positioning solution. However, in cluttered environments, the performance of UWB TDOA-based localization deteriorates due to the biased and non-Gaussian noise distributions induced by obstacles. In this work, we present a bi-level optimization-based joint localization and noise mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16848v1-abstract-full').style.display = 'inline'; document.getElementById('2307.16848v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.16848v1-abstract-full" style="display: none;"> Ultra-wideband (UWB) time difference of arrival(TDOA)-based localization has emerged as a low-cost and scalable indoor positioning solution. However, in cluttered environments, the performance of UWB TDOA-based localization deteriorates due to the biased and non-Gaussian noise distributions induced by obstacles. In this work, we present a bi-level optimization-based joint localization and noise model learning algorithm to address this problem. In particular, we use a Gaussian mixture model (GMM) to approximate the measurement noise distribution. We explicitly incorporate the estimated state&#39;s uncertainty into the GMM noise model learning, referred to as uncertainty-aware GMM, to improve both noise modeling and localization performance. We first evaluate the GMM noise model learning and localization performance in numerous simulation scenarios. We then demonstrate the effectiveness of our algorithm in extensive real-world experiments using two different cluttered environments. We show that our algorithm provides accurate position estimates with low-cost UWB sensors, no prior knowledge about the obstacles in the space, and a significant amount of UWB radios occluded. 
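<p class="is-size-7">The bi-level, uncertainty-aware estimation described above is not spelled out in the abstract; the fragment below shows only the basic building block the method is built around: fitting a Gaussian mixture model to (synthetic, illustrative) TDOA residuals and reusing it as a measurement log-likelihood that a localization back end could query. The variable names, the residual distribution, and the two-component choice are assumptions for this sketch.</p>
<pre><code class="language-python">import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(2)

# Synthetic TDOA residuals (measured minus predicted range difference, in metres):
# mostly small Gaussian noise, plus a positively biased NLOS component from obstacles.
los  = rng.normal(0.00, 0.05, 1500)
nlos = rng.normal(0.35, 0.15, 500)
residuals = np.concatenate([los, nlos]).reshape(-1, 1)

# Fit a 2-component GMM as the measurement-noise model.
gmm = GaussianMixture(n_components=2, covariance_type="full", random_state=0)
gmm.fit(residuals)
print("component means:", gmm.means_.ravel(), "weights:", gmm.weights_)

def measurement_loglik(innovation):
    """Log-likelihood of TDOA innovations under the learned noise model;
    a localization back end could use this in place of a single Gaussian."""
    return gmm.score_samples(np.atleast_2d(innovation).reshape(-1, 1))

print(measurement_loglik([0.02, 0.40]))
</code></pre>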
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.15266">arXiv:2306.15266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.15266">pdf</a>, <a href="https://arxiv.org/format/2306.15266">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generalized Out-of-distribution Fault Diagnosis (GOOFD) via Internal Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xingyue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanrong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinlong Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+K">Ke Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+S">Shuting Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+P">Peng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2306.15266v2-abstract-full" style="display: inline;"> Fault diagnosis is crucial in monitoring machines within industrial processes. With the increasing complexity of working conditions and demand for safety during production, diverse diagnosis methods are required, and an integrated fault diagnosis system capable of handling multiple tasks is highly desired. However, the diagnosis subtasks are often studied separately, and the current methods still need improvement for such a generalized system. To address this issue, we propose the Generalized Out-of-distribution Fault Diagnosis (GOOFD) framework to integrate diagnosis subtasks. Additionally, a unified fault diagnosis method based on internal contrastive learning and Mahalanobis distance is put forward to underpin the proposed generalized framework.
The method involves feature extraction through internal contrastive learning and outlier recognition based on the Mahalanobis distance. Our proposed method can be applied to multiple fault diagnosis tasks and achieves better performance than the existing single-task methods. Experiments are conducted on benchmark and practical process datasets, indicating the effectiveness of the proposed framework. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p>
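<p class="is-size-7">Of the two ingredients named above, the sketch below illustrates only the second, generic one: class-wise means with a shared covariance estimated on features of known faults, and the minimum Mahalanobis distance used as an out-of-distribution (unseen fault) score. It is a standard construction shown for orientation, not the GOOFD implementation, and the synthetic features are placeholders.</p>
<pre><code class="language-python">import numpy as np

def fit_mahalanobis(features, labels):
    """Class means and a shared (tied) precision matrix estimated on known classes."""
    classes = np.unique(labels)
    means = {c: features[labels == c].mean(0) for c in classes}
    centered = np.vstack([features[labels == c] - means[c] for c in classes])
    cov = np.cov(centered, rowvar=False) + 1e-6 * np.eye(features.shape[1])
    return means, np.linalg.inv(cov)

def ood_score(x, means, prec):
    """Minimum squared Mahalanobis distance to any known class; large => likely unseen fault."""
    return min(float((x - m) @ prec @ (x - m)) for m in means.values())

rng = np.random.default_rng(3)
train = np.vstack([rng.normal(0, 1, (200, 8)), rng.normal(3, 1, (200, 8))])
labels = np.array([0] * 200 + [1] * 200)
means, prec = fit_mahalanobis(train, labels)

print("in-distribution score:", ood_score(rng.normal(0, 1, 8), means, prec))
print("novel-fault score:    ", ood_score(rng.normal(8, 1, 8), means, prec))
</code></pre>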
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03584">arXiv:2306.03584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.03584">pdf</a>, <a href="https://arxiv.org/format/2306.03584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TPAMI.2024.3388004">10.1109/TPAMI.2024.3388004 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haowen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+Z">Zhengping Che</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yufan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mingyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiuquan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+M">Mengshi Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Feifei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2306.03584v2-abstract-full" style="display: inline;"> Raw depth images captured in indoor scenarios frequently exhibit extensive missing values due to the inherent limitations of the sensors and environments. For example, transparent materials frequently elude detection by depth sensors; surfaces may introduce measurement inaccuracies due to their polished textures, extended distances, and oblique incidence angles from the sensor. The presence of incomplete depth maps imposes significant challenges for subsequent vision applications, prompting the development of numerous depth completion techniques to mitigate this problem. Numerous methods excel at reconstructing dense depth maps from sparse samples, but they often falter when faced with extensive contiguous regions of missing depth values, a prevalent and critical challenge in indoor environments. To overcome these challenges, we design a novel two-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB and incomplete depth images as input to predict a dense and completed depth map. The first branch employs an encoder-decoder structure, by adhering to the Manhattan world assumption and utilizing normal maps from RGB-D information as guidance, to regress the local dense depth values from the raw depth map. The other branch applies an RGB-depth fusion CycleGAN, adept at translating RGB imagery into detailed, textured depth maps while ensuring high fidelity through cycle consistency. We fuse the two branches via adaptive fusion modules named W-AdaIN and train the model with the help of pseudo depth maps. Comprehensive evaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method significantly enhances depth completion performance, particularly in realistic indoor settings. </span> </p>
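<p class="is-size-7">The abstract names adaptive fusion modules called W-AdaIN but does not define them. For orientation only, the snippet below implements plain adaptive instance normalization (AdaIN), which rescales one branch's feature map with the channel-wise statistics of the other; the W-AdaIN variant used in RDFC-GAN may differ, and the tensor shapes here are illustrative.</p>
<pre><code class="language-python">import torch

def adain(content: torch.Tensor, style: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """Adaptive instance normalization for NCHW feature maps: normalize `content`
    per channel, then apply the channel-wise mean/std of `style`."""
    c_mean = content.mean(dim=(2, 3), keepdim=True)
    c_std = content.std(dim=(2, 3), keepdim=True) + eps
    s_mean = style.mean(dim=(2, 3), keepdim=True)
    s_std = style.std(dim=(2, 3), keepdim=True) + eps
    return s_std * (content - c_mean) / c_std + s_mean

# e.g. fuse a depth-branch feature map with statistics taken from the RGB branch
depth_feat = torch.randn(2, 64, 32, 32)
rgb_feat = torch.randn(2, 64, 32, 32)
fused = adain(depth_feat, rgb_feat)
print(fused.shape)  # torch.Size([2, 64, 32, 32])
</code></pre>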
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Haowen Wang and Zhengping Che are with equal contributions. Paper accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI). An earlier version has been accepted by CVPR 2022 (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.15424">arXiv:2305.15424</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.15424">pdf</a>, <a href="https://arxiv.org/format/2305.15424">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PulseNet: Deep Learning ECG-signal classification using random augmentation policy and continuous wavelet transform for canines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dourson%2C+A">Andre Dourson</a>, <a href="/search/cs?searchtype=author&amp;query=Santilli%2C+R">Roberto Santilli</a>, <a href="/search/cs?searchtype=author&amp;query=Marchesotti%2C+F">Federica Marchesotti</a>, <a href="/search/cs?searchtype=author&amp;query=Schneiderman%2C+J">Jennifer Schneiderman</a>, <a href="/search/cs?searchtype=author&amp;query=Stiel%2C+O+R">Oliver Roman Stiel</a>, <a href="/search/cs?searchtype=author&amp;query=Junior%2C+F">Fernando Junior</a>, <a href="/search/cs?searchtype=author&amp;query=Fitzke%2C+M">Michael Fitzke</a>, <a href="/search/cs?searchtype=author&amp;query=Sithirangathan%2C+N">Norbert Sithirangathan</a>, <a href="/search/cs?searchtype=author&amp;query=Walleser%2C+E">Emil Walleser</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiaoli Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Parkinson%2C+M">Mark Parkinson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2305.15424v2-abstract-full" style="display: inline;"> Evaluating canine electrocardiograms (ECG) requires skilled veterinarians, but current availability of veterinary cardiologists for ECG interpretation and diagnostic support is limited. Developing tools for automated assessment of ECG sequences can improve veterinary care by providing clinicians real-time results and decision support tools. We implement a deep convolutional neural network (CNN) approach for classifying canine electrocardiogram sequences as either normal or abnormal.
ECG records are converted into 8 second Lead II sequences and classified as either normal (no evidence of cardiac abnormalities) or abnormal (presence of one or more cardiac abnormalities). For training, ECG sequences are randomly augmented using RandomAugmentECG, a new augmentation library implemented specifically for this project. Each chunk is then converted using a continuous wavelet transform into a 2D scalogram. The 2D scalograms are then classified as either normal or abnormal by a binary CNN classifier. Experimental results are validated against three boarded veterinary cardiologists, achieving an AUC-ROC score of 0.9506 on the test dataset, matching human-level performance. Additionally, we describe model deployment to Microsoft Azure using an MLOps approach. To our knowledge, this work is one of the first attempts to implement a deep learning model to automatically classify ECG sequences for canines. Implementing automated ECG classification will enhance veterinary care through improved diagnostic performance and increased clinic efficiency. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14543">arXiv:2305.14543</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14543">pdf</a>, <a href="https://arxiv.org/format/2305.14543">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Functional Factor Models: Forecasting High-Dimensional Functional Time Series via Bayesian Nonparametric Factorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yirui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xinghao Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+Y">Yulong Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liying Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14543v2-abstract-short" style="display: inline;"> This paper introduces the Deep Functional Factor Model (DF2M), a Bayesian nonparametric model designed for analysis of high-dimensional functional time series. DF2M is built upon the Indian Buffet Process and the multi-task Gaussian Process, incorporating a deep kernel function that captures non-Markovian and nonlinear temporal dynamics.
Unlike many black-box deep learning models, DF2M offers an e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14543v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14543v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14543v2-abstract-full" style="display: none;"> This paper introduces the Deep Functional Factor Model (DF2M), a Bayesian nonparametric model designed for analysis of high-dimensional functional time series. DF2M is built upon the Indian Buffet Process and the multi-task Gaussian Process, incorporating a deep kernel function that captures non-Markovian and nonlinear temporal dynamics. Unlike many black-box deep learning models, DF2M offers an explainable approach to utilizing neural networks by constructing a factor model and integrating deep neural networks within the kernel function. Additionally, we develop a computationally efficient variational inference algorithm to infer DF2M. Empirical results from four real-world datasets demonstrate that DF2M provides better explainability and superior predictive accuracy compared to conventional deep learning models for high-dimensional functional time series. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14543v2-abstract-full').style.display = 'none'; document.getElementById('2305.14543v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.09307">arXiv:2303.09307</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.09307">pdf</a>, <a href="https://arxiv.org/format/2303.09307">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Depth Super-Resolution from Explicit and Implicit High-Frequency Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xin Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+C">Chenyang Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yanhui Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&amp;query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&amp;query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.09307v2-abstract-short" style="display: inline;"> We propose a novel multi-stage depth super-resolution network, which progressively reconstructs high-resolution depth maps from explicit and implicit high-frequency features. 
The former are extracted by an efficient transformer processing both local and global contexts, while the latter are obtained by projecting color images into the frequency domain. Both are combined together with depth feature&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09307v2-abstract-full').style.display = 'inline'; document.getElementById('2303.09307v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.09307v2-abstract-full" style="display: none;"> We propose a novel multi-stage depth super-resolution network, which progressively reconstructs high-resolution depth maps from explicit and implicit high-frequency features. The former are extracted by an efficient transformer processing both local and global contexts, while the latter are obtained by projecting color images into the frequency domain. Both are combined together with depth features by means of a fusion strategy within a multi-stage and multi-scale framework. Experiments on the main benchmarks, such as NYUv2, Middlebury, DIML and RGBDD, show that our approach outperforms existing methods by a large margin (~20% on NYUv2 and DIML against the contemporary work DADA, with 16x upsampling), establishing a new state-of-the-art in the guided depth super-resolution task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09307v2-abstract-full').style.display = 'none'; document.getElementById('2303.09307v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.06808">arXiv:2210.06808</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.06808">pdf</a>, <a href="https://arxiv.org/format/2210.06808">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> ISCom: Interest-aware Semantic Communication Scheme for Point Cloud Video Streaming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yakun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+B">Boyuan Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuanwei Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiuquan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+X">Xiang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.06808v1-abstract-short" style="display: inline;"> The provisioning of immersive point cloud video (PCV) streaming on pervasive mobile devices is a cornerstone for enabling immersive communication and interactions in the future 6G metaverse era. 
However, most streaming techniques are dedicated to efficient PCV compression and codec extending from traditional 3-DoF video services. Some emerging AI-enabled approaches are still in their infancy phase&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06808v1-abstract-full').style.display = 'inline'; document.getElementById('2210.06808v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.06808v1-abstract-full" style="display: none;"> The provisioning of immersive point cloud video (PCV) streaming on pervasive mobile devices is a cornerstone for enabling immersive communication and interactions in the future 6G metaverse era. However, most streaming techniques are dedicated to efficient PCV compression and codec extending from traditional 3-DoF video services. Some emerging AI-enabled approaches are still in their infancy phase and are constrained by intensive computational and adaptive flow techniques. In this paper, we present ISCom, an Interest-aware Semantic Communication Scheme for PCV, consisting of a region-of-interest (ROI) selection module, a lightweight PCV streaming module, and an intelligent scheduler. First, we propose a two-stage efficient ROI selection method for providing interest-aware PCV streaming, which significantly reduces the data volume. Second, we design a lightweight PCV encoder-decoder network for resource-constrained devices, adapting to the heterogeneous computing capabilities of terminals. Third, we train a deep reinforcement learning (DRL)-based scheduler to adapt an optimal encoder-decoder network for various devices, considering the dynamic network environments and computing capabilities of different devices. Extensive experiments show that ISCom outperforms baselines on mobile devices at least 10 FPS and up to 22 FPS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06808v1-abstract-full').style.display = 'none'; document.getElementById('2210.06808v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.06802">arXiv:2210.06802</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.06802">pdf</a>, <a href="https://arxiv.org/format/2210.06802">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Multi-Player Immersive Communications and Interactions in Metaverse: Challenges, Architecture, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yakun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiuquan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haowen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+X">Xiang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Dustdar%2C+S">Schahram Dustdar</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.06802v1-abstract-short" style="display: inline;"> The metaverse has awakened users&#39; expectations of an immersive interaction that fuses the virtual digital world and the physical world across space and time. However, the metaverse is still in its infancy, typically expanding multi-player applications (e.g., multi-player games) to implement a prototype with the help of 5G/Beyond 5G, Artificial Intelligence, digital twin, and other enabling technol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06802v1-abstract-full').style.display = 'inline'; document.getElementById('2210.06802v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.06802v1-abstract-full" style="display: none;"> The metaverse has awakened users&#39; expectations of an immersive interaction that fuses the virtual digital world and the physical world across space and time. However, the metaverse is still in its infancy, typically expanding multi-player applications (e.g., multi-player games) to implement a prototype with the help of 5G/Beyond 5G, Artificial Intelligence, digital twin, and other enabling technologies. This article reviews the characteristics, key enabling technologies, and driving applications of the state-of-the-art metaverse. We focus on the immersive interactions perspective of the metaverse from the tasks, inputs, and feedback across the users, digital world, and physical world and reveal the key challenges. Afterwards, we present a multi-player interaction prototype platform based on a cloud-edge-device collaborative framework. Also, we evaluate it with centralized and device-to-device (D2D) approaches to verify the efficiency and flexibility of interactions. Finally, we point out future research approaches and discuss potential solutions to enable more stable and higher quality multi-player interactions for metaverse services. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06802v1-abstract-full').style.display = 'none'; document.getElementById('2210.06802v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.06794">arXiv:2210.06794</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.06794">pdf</a>, <a href="https://arxiv.org/format/2210.06794">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Towards Holographic Video Communications: A Promising AI-driven Solution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yakun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuanwei Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xiuquan Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+X">Xiang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Dustdar%2C+S">Schahram Dustdar</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.06794v1-abstract-short" style="display: inline;"> Real-time holographic video communications enable immersive experiences for next-generation video services in the future metaverse era. However, high-fidelity holographic videos require high bandwidth and significant computation resources, which exceed the transferring and computing capacity of 5G networks. This article reviews state-of-the-art holographic point cloud video (PCV) transmission tech&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06794v1-abstract-full').style.display = 'inline'; document.getElementById('2210.06794v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.06794v1-abstract-full" style="display: none;"> Real-time holographic video communications enable immersive experiences for next-generation video services in the future metaverse era. However, high-fidelity holographic videos require high bandwidth and significant computation resources, which exceed the transferring and computing capacity of 5G networks. This article reviews state-of-the-art holographic point cloud video (PCV) transmission techniques and highlights the critical challenges of delivering such immersive services. We further implement a preliminary prototype of an AI-driven holographic video communication system and present critical experimental results to evaluate its performance. Finally, we identify future research directions and discuss potential solutions for providing real-time and high-quality holographic experiences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06794v1-abstract-full').style.display = 'none'; document.getElementById('2210.06794v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.13866">arXiv:2209.13866</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.13866">pdf</a>, <a href="https://arxiv.org/format/2209.13866">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Blur Synthesis for Deep Real-World Image Deblurring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Hao Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+C">Chenyang Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xin Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+P">Pengchao Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.13866v1-abstract-short" style="display: inline;"> In this paper, we examine the problem of real-world image deblurring and take into account two key factors for improving the performance of the deep image deblurring model, namely, training data synthesis and network architecture design. Deblurring models trained on existing synthetic datasets perform poorly on real blurry images due to domain shift. To reduce the domain gap between synthetic and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.13866v1-abstract-full').style.display = 'inline'; document.getElementById('2209.13866v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.13866v1-abstract-full" style="display: none;"> In this paper, we examine the problem of real-world image deblurring and take into account two key factors for improving the performance of the deep image deblurring model, namely, training data synthesis and network architecture design. Deblurring models trained on existing synthetic datasets perform poorly on real blurry images due to domain shift. To reduce the domain gap between synthetic and real domains, we propose a novel realistic blur synthesis pipeline to simulate the camera imaging process. As a result of our proposed synthesis method, existing deblurring models could be made more robust to handle real-world blur. Furthermore, we develop an effective deblurring model that captures non-local dependencies and local context in the feature domain simultaneously. Specifically, we introduce the multi-path transformer module to UNet architecture for enriched multi-scale features learning. A comprehensive experiment on three real-world datasets shows that the proposed deblurring model performs better than state-of-the-art methods. 
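<p class="is-size-7">The abstract argues for synthesizing more realistic blur by simulating the camera imaging process but gives no details here. The toy pipeline below shows only the generic ingredients such a synthesis usually involves (convolution with a motion-blur kernel in approximately linear intensity space, plus sensor noise); it is not the paper's pipeline, and the kernel construction, gamma value, and noise level are assumptions.</p>
<pre><code class="language-python">import numpy as np
from scipy.ndimage import convolve

rng = np.random.default_rng(4)

def motion_kernel(length=9, angle_deg=0.0):
    """Simple linear motion-blur kernel (a rotated line, normalised to sum to 1)."""
    k = np.zeros((length, length))
    c = length // 2
    t = np.deg2rad(angle_deg)
    for i in np.linspace(-c, c, 4 * length):
        r, col = int(round(c + i * np.sin(t))), int(round(c + i * np.cos(t)))
        k[r, col] = 1.0
    return k / k.sum()

def synthesize_blur(sharp_srgb, kernel, read_noise=0.01):
    """Blur in (approximately) linear space, add noise, return to an sRGB-like gamma."""
    linear = np.clip(sharp_srgb, 0, 1) ** 2.2            # undo display gamma (approximation)
    blurred = convolve(linear, kernel, mode="reflect")    # camera shake / object motion
    blurred += rng.normal(0, read_noise, blurred.shape)   # sensor noise
    return np.clip(blurred, 0, 1) ** (1 / 2.2)

sharp = rng.uniform(0, 1, (64, 64))                       # stand-in grayscale image
blurry = synthesize_blur(sharp, motion_kernel(9, 30.0))
print(blurry.shape, float(blurry.min()), float(blurry.max()))
</code></pre>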
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.13866v1-abstract-full').style.display = 'none'; document.getElementById('2209.13866v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.09963">arXiv:2209.09963</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.09963">pdf</a>, <a href="https://arxiv.org/format/2209.09963">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning Acceptance Regions for Many Classes with Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Xingye Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.09963v1-abstract-short" style="display: inline;"> Set-valued classification, a new classification paradigm that aims to identify all the plausible classes that an observation belongs to, can be obtained by learning the acceptance regions for all classes. Many existing set-valued classification methods do not consider the possibility that a new class that never appeared in the training data appears in the test data. Moreover, they are computationa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.09963v1-abstract-full').style.display = 'inline'; document.getElementById('2209.09963v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.09963v1-abstract-full" style="display: none;"> Set-valued classification, a new classification paradigm that aims to identify all the plausible classes that an observation belongs to, can be obtained by learning the acceptance regions for all classes. Many existing set-valued classification methods do not consider the possibility that a new class that never appeared in the training data appears in the test data. Moreover, they are computationally expensive when the number of classes is large. We propose a Generalized Prediction Set (GPS) approach to estimate the acceptance regions while considering the possibility of a new class in the test data. The proposed classifier minimizes the expected size of the prediction set while guaranteeing that the class-specific accuracy is at least a pre-specified value. Unlike previous methods, the proposed method achieves a good balance between accuracy, efficiency, and anomaly detection rate. Moreover, our method can be applied in parallel to all the classes to alleviate the computational burden. 
arXiv:2209.09963 [pdf, other] - stat.ML (Machine Learning); cs.LG (Machine Learning)
Learning Acceptance Regions for Many Classes with Anomaly Detection
Authors: Zhou Wang, Xingye Qiao
Abstract: Set-valued classification, a new classification paradigm that aims to identify all the plausible classes that an observation belongs to, can be obtained by learning the acceptance regions for all classes. Many existing set-valued classification methods do not consider the possibility that a new class that never appeared in the training data appears in the test data. Moreover, they are computationally expensive when the number of classes is large. We propose a Generalized Prediction Set (GPS) approach to estimate the acceptance regions while considering the possibility of a new class in the test data. The proposed classifier minimizes the expected size of the prediction set while guaranteeing that the class-specific accuracy is at least a pre-specified value. Unlike previous methods, the proposed method achieves a good balance between accuracy, efficiency, and anomaly detection rate. Moreover, our method can be applied in parallel to all the classes to alleviate the computational burden. Both theoretical analysis and numerical experiments are conducted to illustrate the effectiveness of the proposed method.
Submitted 20 September, 2022; originally announced September 2022.
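The notion of class-wise acceptance regions described above can be sketched with a simple threshold-on-score construction: for each class, pick a threshold on held-out data so that roughly the pre-specified fraction of that class is accepted, then report every class whose region contains the test point; an empty set flags a possible new class. This is a hedged toy illustration of the general idea, not the GPS estimator from the paper, and the scoring function below is an arbitrary assumption.

import numpy as np

def fit_acceptance_regions(scores_by_class, target_accuracy=0.95):
    """For each class c, pick a threshold so that ~target_accuracy of held-out
    points truly from class c have score >= threshold."""
    return {c: np.quantile(s, 1.0 - target_accuracy)
            for c, s in scores_by_class.items()}

def prediction_set(score_fn, x, thresholds):
    """All classes whose acceptance region contains x; an empty set is
    treated as 'possible new class / anomaly'."""
    return {c for c, t in thresholds.items() if score_fn(x, c) >= t}

# Toy usage: two 1-D Gaussian classes, score = negative distance to the class mean.
rng = np.random.default_rng(0)
means = {0: -1.0, 1: 1.0}
held_out = {c: -np.abs(rng.normal(m, 1.0, 500) - m) for c, m in means.items()}
thr = fit_acceptance_regions(held_out, target_accuracy=0.95)

def score(x, c):
    """Toy class-conditional score (not the paper's): negative distance to the mean."""
    return -abs(x - means[c])

print(prediction_set(score, 0.0, thr))   # typically {0, 1}: an ambiguous point
print(prediction_set(score, 6.0, thr))   # typically set(): flagged as a possible new class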
arXiv:2208.12789 [pdf, other] - cs.AI (Artificial Intelligence)
Learning and Compositionality: a Unification Attempt via Connectionist Probabilistic Programming
Authors: Ximing Qiao, Hai Li
Abstract: We consider learning and compositionality as the key mechanisms towards simulating human-like intelligence. While each mechanism is successfully achieved by neural networks and symbolic AIs, respectively, it is the combination of the two mechanisms that makes human-like intelligence possible. Despite the numerous attempts at building hybrid neural-symbolic systems, we argue that our true goal should be unifying learning and compositionality, the core mechanisms, instead of neural and symbolic methods, the surface approaches to achieve them. In this work, we review and analyze the strengths and weaknesses of neural and symbolic methods by separating their forms and meanings (structures and semantics), and propose Connectionist Probabilistic Programs (CPPs), a framework that connects connectionist structures (for learning) and probabilistic program semantics (for compositionality). Under the framework, we design a CPP extension for small-scale sequence modeling and provide a learning algorithm based on Bayesian inference. Although challenges exist in learning complex patterns without supervision, our early results demonstrate CPP's successful extraction of concepts and relations from raw sequential data, an initial step towards compositional learning.
Submitted 26 August, 2022; originally announced August 2022.

arXiv:2208.06322 [pdf, other] - stat.ML (Machine Learning); cs.LG (Machine Learning)
EEGNN: Edge Enhanced Graph Neural Network with a Bayesian Nonparametric Graph Model
Authors: Yirui Liu, Xinghao Qiao, Liying Wang, Jessica Lam
Abstract: Training deep graph neural networks (GNNs) poses a challenging task, as the performance of GNNs may suffer from the number of hidden message-passing layers. The literature has focused on the proposals of over-smoothing and under-reaching to explain the performance deterioration of deep GNNs. In this paper, we propose a new explanation for this performance deterioration, mis-simplification, that is, mistakenly simplifying graphs by preventing self-loops and forcing edges to be unweighted. We show that such simplification can reduce the potential of message-passing layers to capture the structural information of graphs. In view of this, we propose a new framework, edge enhanced graph neural network (EEGNN).
EEGNN uses the structural information extracted from the proposed Dirichlet mixture Poisson graph model (DMPGM), a Bayesian nonparametric model for graphs, to improve the performance of various deep message-passing GNNs. We propose a Markov chain Monte Carlo inference framework for DMPGM. Experiments over different datasets show that our method achieves a considerable performance increase compared to baselines.
Submitted 23 February, 2023; v1 submitted 12 August, 2022; originally announced August 2022.
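The mis-simplification argument above can be made concrete with a toy propagation step: a "simplified" aggregation that binarizes edges and drops self-loops discards structure that an edge-enhanced aggregation keeps. The sketch below only contrasts the two aggregations on a made-up multigraph; it does not implement EEGNN, DMPGM, or its MCMC inference.

import numpy as np

def propagate(adj, feats):
    """One mean-style message-passing step: row-normalized adjacency times features."""
    deg = adj.sum(axis=1, keepdims=True)
    deg[deg == 0] = 1.0
    return (adj / deg) @ feats

# Tiny multigraph: entry (i, j) = number of edges between i and j, diagonal = self-loops.
# Values are illustrative only.
multi_adj = np.array([[1.0, 3.0, 0.0],
                      [3.0, 0.0, 1.0],
                      [0.0, 1.0, 2.0]])

# "Mis-simplified" view: drop self-loops, force edges to be unweighted.
simple_adj = (multi_adj > 0).astype(float)
np.fill_diagonal(simple_adj, 0.0)

feats = np.eye(3)                     # one-hot node features
print(propagate(multi_adj, feats))    # keeps multiplicities and self-loops
print(propagate(simple_adj, feats))   # this structural information is lost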
arXiv:2204.06192 [pdf, other] - cs.NI (Networking and Internet Architecture)
6G-enabled Edge AI for Metaverse: Challenges, Methods, and Future Research Directions
Authors: Luyi Chang, Zhe Zhang, Pei Li, Shan Xi, Wei Guo, Yukang Shen, Zehui Xiong, Jiawen Kang, Dusit Niyato, Xiuquan Qiao, Yi Wu
Abstract: 6G-enabled edge intelligence opens up a new era of Internet of Everything and makes it possible to interconnect people-devices-cloud anytime, anywhere. More and more next-generation wireless network smart service applications are changing our way of life and improving our quality of life. As the hottest new form of next-generation Internet applications, Metaverse is striving to connect billions of users and create a shared world where the virtual and the real merge. However, limited by resources, computing power, and sensory devices, Metaverse is still far from realizing its full vision of immersion, materialization, and interoperability. To this end, this survey aims to realize this vision through the organic integration of 6G-enabled edge AI and Metaverse. Specifically, we first introduce three new types of edge-Metaverse architectures that use 6G-enabled edge AI to solve resource and computing constraints in Metaverse. Then we summarize technical challenges that these architectures face in Metaverse and the existing solutions. Furthermore, we explore how the edge-Metaverse architecture technology helps Metaverse to interact and share digital data. Finally, we discuss future research directions to realize the true vision of Metaverse with 6G-enabled edge AI.
Submitted 13 April, 2022; originally announced April 2022.
Comments: 16 pages

arXiv:2203.14471 [pdf, other] - cs.RO (Robotics)
UTIL: An Ultra-wideband Time-difference-of-arrival Indoor Localization Dataset
Authors: Wenda Zhao, Abhishek Goudar, Xinyuan Qiao, Angela P. Schoellig
Abstract: Ultra-wideband (UWB) time-difference-of-arrival (TDOA)-based localization has emerged as a promising, low-cost, and scalable indoor localization solution, which is especially suited for multi-robot applications. However, there is a lack of public datasets to study and benchmark UWB TDOA positioning technology in cluttered indoor environments. We fill in this gap by presenting a comprehensive dataset using Decawave's DWM1000 UWB modules. To characterize the UWB TDOA measurement performance under various line-of-sight (LOS) and non-line-of-sight (NLOS) conditions, we collected signal-to-noise ratio (SNR), power difference values, and raw UWB TDOA measurements during the identification experiments. We also conducted a cumulative total of around 150 minutes of real-world flight experiments on a customized quadrotor platform to benchmark the UWB TDOA localization performance for mobile robots. The quadrotor was commanded to fly with an average speed of 0.45 m/s in both obstacle-free and cluttered environments using four different UWB anchor constellations. Raw sensor data including UWB TDOA, inertial measurement unit (IMU), optical flow, time-of-flight (ToF) laser altitude, and millimeter-accurate ground truth robot poses were collected during the flights. The dataset and development kit are available at https://utiasdsl.github.io/util-uwb-dataset/.
Submitted 5 February, 2024; v1 submitted 27 March, 2022; originally announced March 2022.
arXiv:2203.10856 [pdf, other] - cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
RGB-Depth Fusion GAN for Indoor Depth Completion
Authors: Haowen Wang, Mingyuan Wang, Zhengping Che, Zhiyuan Xu, Xiuquan Qiao, Mengshi Qi, Feifei Feng, Jian Tang
Abstract: The raw depth image captured by the indoor depth sensor usually has an extensive range of missing depth values due to inherent limitations such as the inability to perceive transparent objects and limited distance range. The incomplete depth map burdens many downstream vision tasks, and a rising number of depth completion methods have been proposed to alleviate this issue. While most existing methods can generate accurate dense depth maps from sparse and uniformly sampled depth maps, they are not suitable for completing the large contiguous regions of missing depth values, which is common and critical. In this paper, we design a novel two-branch end-to-end fusion network, which takes a pair of RGB and incomplete depth images as input to predict a dense and completed depth map. The first branch employs an encoder-decoder structure to regress the local dense depth values from the raw depth map, with the help of local guidance information extracted from the RGB image. In the other branch, we propose an RGB-depth fusion GAN to transfer the RGB image to the fine-grained textured depth map.
We adopt adaptive fusion modules named W-AdaIN to propagate the features across the two branches, and we append a confidence fusion head to fuse the two outputs of the branches for the final depth map. Extensive experiments on NYU-Depth V2 and SUN RGB-D demonstrate that our proposed method clearly improves the depth completion performance, especially in a more realistic setting of indoor environments with the help of the pseudo depth map.
Submitted 21 March, 2022; originally announced March 2022.
Comments: CVPR 2022
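The confidence-fusion idea in the last sentence (two branch outputs blended by a learned confidence map) boils down to a per-pixel convex combination. The sketch below assumes the two depth predictions and a confidence logit map are already available; the names local_depth, gan_depth, and conf_logits are placeholders for this illustration, not the paper's API, and the learned W-AdaIN modules are not modeled here.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def fuse_depth(local_depth, gan_depth, conf_logits):
    """Per-pixel confidence-weighted fusion of two depth predictions. Where the
    confidence is high the local (encoder-decoder) branch dominates; elsewhere
    the GAN-textured branch fills in. A toy stand-in for a learned fusion head."""
    w = sigmoid(conf_logits)
    return w * local_depth + (1.0 - w) * gan_depth

# Toy usage with random maps standing in for network outputs.
rng = np.random.default_rng(0)
H, W = 4, 4
local_depth = rng.uniform(0.5, 3.0, (H, W))   # hypothetical depths in metres
gan_depth = rng.uniform(0.5, 3.0, (H, W))
conf_logits = rng.normal(0.0, 1.0, (H, W))
fused = fuse_depth(local_depth, gan_depth, conf_logits)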
arXiv:2203.00781 [pdf, other] - cs.HC (Human-Computer Interaction); cs.LG (Machine Learning); stat.ML (Machine Learning)
Enhanced Nearest Neighbor Classification for Crowdsourcing
Authors: Jiexin Duan, Xingye Qiao, Guang Cheng
Abstract: In machine learning, crowdsourcing is an economical way to label a large amount of data. However, the noise in the produced labels may deteriorate the accuracy of any classification method applied to the labelled data. We propose an enhanced nearest neighbor classifier (ENN) to overcome this issue. Two algorithms are developed to estimate the worker quality (which is often unknown in practice): one constructs the estimate based on the denoised worker labels by applying the $k$NN classifier to the expert data; the other is an iterative algorithm that works even without access to the expert data. Besides strong numerical evidence, our proposed methods are proven to achieve the same regret as their oracle versions based on high-quality expert data. As a technical by-product, a lower bound on the sample size assigned to each worker to reach the optimal convergence rate of regret is derived.
Submitted 26 February, 2022; originally announced March 2022.
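The first of the two worker-quality algorithms described above (compare each worker's labels against a $k$NN classifier fitted on expert-labelled data, then weight workers by their agreement) can be sketched as follows. This is a simplified illustration under obvious assumptions (binary labels, quality as an agreement rate), not the paper's ENN estimator or its iterative, expert-free variant.

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def estimate_worker_quality(expert_X, expert_y, crowd_X, crowd_labels, k=5):
    """crowd_labels: array of shape (n_workers, n_crowd_points), one row per worker.
    Worker quality is approximated by the agreement rate with a kNN classifier
    trained on the expert-labelled data."""
    knn = KNeighborsClassifier(n_neighbors=k).fit(expert_X, expert_y)
    reference = knn.predict(crowd_X)                 # denoised proxy labels
    return (crowd_labels == reference).mean(axis=1)  # per-worker agreement rate

def weighted_vote(crowd_labels, quality):
    """Aggregate binary (0/1) crowd labels with quality weights."""
    weights = quality / quality.sum()
    return (weights @ crowd_labels >= 0.5).astype(int)

# Hypothetical wiring (expert_X, expert_y, crowd_X, crowd_labels are placeholders):
# quality = estimate_worker_quality(expert_X, expert_y, crowd_X, crowd_labels)
# agg_labels = weighted_vote(crowd_labels, quality)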
arXiv:2106.06937 [pdf, other] - cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Common Sense Beyond English: Evaluating and Improving Multilingual Language Models for Commonsense Reasoning
Authors: Bill Yuchen Lin, Seyeon Lee, Xiaoyang Qiao, Xiang Ren
Abstract: Commonsense reasoning research has so far been limited to English. We aim to evaluate and improve popular multilingual language models (ML-LMs) to help advance commonsense reasoning (CSR) beyond English. We collect the Mickey Corpus, consisting of 561k sentences in 11 different languages, which can be used for analyzing and improving ML-LMs. We propose Mickey Probe, a language-agnostic probing task for fairly evaluating the common sense of popular ML-LMs across different languages. In addition, we create two new datasets, X-CSQA and X-CODAH, by translating their English versions to 15 other languages, so that we can evaluate popular ML-LMs for cross-lingual commonsense reasoning. To improve the performance beyond English, we propose a simple yet effective method, multilingual contrastive pre-training (MCP). It significantly enhances sentence representations, yielding a large performance gain on both benchmarks.
Submitted 13 June, 2021; originally announced June 2021.
Comments: Accepted to ACL-IJCNLP 2021 (long paper at main conference). Project website: https://inklab.usc.edu/XCSR/
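Multilingual contrastive pre-training, as named above, is at heart a contrastive objective over parallel sentences: translations of the same sentence should receive similar representations, while other sentences in the batch act as negatives. The symmetric InfoNCE-style loss below is a generic, hedged sketch of that idea, not the exact MCP objective or hyper-parameters from the paper.

import numpy as np

def mcp_style_loss(emb_a, emb_b, temperature=0.05):
    """emb_a[i] and emb_b[i] are embeddings of the same sentence in two languages.
    Returns a symmetric InfoNCE-style loss that pulls aligned pairs together and
    pushes apart the other sentences in the batch."""
    a = emb_a / np.linalg.norm(emb_a, axis=1, keepdims=True)
    b = emb_b / np.linalg.norm(emb_b, axis=1, keepdims=True)
    logits = (a @ b.T) / temperature                 # scaled cosine similarities
    n = len(logits)
    log_sm_rows = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    log_sm_cols = logits - np.log(np.exp(logits).sum(axis=0, keepdims=True))
    diag = np.arange(n)
    return -0.5 * (log_sm_rows[diag, diag].mean() + log_sm_cols[diag, diag].mean())

# Toy usage: random vectors standing in for encoder outputs of a parallel batch.
rng = np.random.default_rng(0)
loss = mcp_style_loss(rng.normal(size=(8, 32)), rng.normal(size=(8, 32)))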
arXiv:2103.15256 [pdf, other] - cs.HC (Human-Computer Interaction); cs.RO (Robotics)
Personalized Affect-Aware Socially Assistive Robot Tutors Aimed at Fostering Social Grit in Children with Autism
Authors: Zhonghao Shi, Manwei Cao, Sophia Pei, Xiaoyang Qiao, Thomas R Groechel, Maja J Matarić
Abstract: Affect-aware socially assistive robotics (SAR) tutors have great potential to augment and democratize professional therapeutic interventions for children with autism spectrum disorders (ASD) from different socioeconomic backgrounds. However, the majority of research on SAR for ASD has been on teaching cognitive and/or social skills, not on addressing users' emotional needs for real-world social situations. To bridge that gap, this work aims to develop personalized affect-aware SAR tutors to help alleviate social anxiety and foster social grit (the growth mindset for social skill development) in children with ASD. We propose a novel paradigm to incorporate clinically validated Acceptance and Commitment Training (ACT) with personalized SAR interventions. This work paves the way toward developing personalized affect-aware SAR interventions to support the unique and diverse socio-emotional needs and challenges of children with ASD.
Submitted 28 March, 2021; originally announced March 2021.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACM/IEEE International Conference on Human-Robot Interaction Workshop on Child-Robot Interaction and Child&#39;s Fundamental Rights</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.08177">arXiv:2101.08177</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.08177">pdf</a>, <a href="https://arxiv.org/format/2101.08177">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On Provable Backdoor Defense in Collaborative Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+X">Ximing Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuhua Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Siping Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+A">Ang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yiran Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hai Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.08177v1-abstract-short" style="display: inline;"> As collaborative learning allows joint training of a model using multiple sources of data, the security problem has been a central concern. Malicious users can upload poisoned data to prevent the model&#39;s convergence or inject hidden backdoors. The so-called backdoor attacks are especially difficult to detect since the model behaves normally on standard test data but gives wrong outputs when trigge&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.08177v1-abstract-full').style.display = 'inline'; document.getElementById('2101.08177v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.08177v1-abstract-full" style="display: none;"> As collaborative learning allows joint training of a model using multiple sources of data, the security problem has been a central concern. Malicious users can upload poisoned data to prevent the model&#39;s convergence or inject hidden backdoors. The so-called backdoor attacks are especially difficult to detect since the model behaves normally on standard test data but gives wrong outputs when triggered by certain backdoor keys. Although Byzantine-tolerant training algorithms provide convergence guarantee, provable defense against backdoor attacks remains largely unsolved. Methods based on randomized smoothing can only correct a small number of corrupted pixels or labels; methods based on subset aggregation cause a severe drop in classification accuracy due to low data utilization. We propose a novel framework that generalizes existing subset aggregation methods. The framework shows that the subset selection process, a deciding factor for subset aggregation methods, can be viewed as a code design problem. 
We derive the theoretical bound of the data utilization ratio and provide an optimal code construction. Experiments on non-IID versions of MNIST and CIFAR-10 show that our method with optimal codes significantly outperforms baselines using non-overlapping partition and random selection. Additionally, integration with existing coding theory results shows that special codes can track the location of the attackers. Such capability provides new countermeasures to backdoor attacks.
Submitted 19 January, 2021; originally announced January 2021.
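The subset-aggregation setting described above, where each sub-model is trained on data from a subset of users and predictions are combined by majority vote so that a bounded number of malicious users can corrupt only a bounded number of sub-models, can be sketched as follows. The subset assignment here is a plain random design and train_model is a hypothetical placeholder; the paper's actual contribution, the optimal code-based subset construction, is not reproduced.

import numpy as np

def build_subsets(n_users, n_models, users_per_model, seed=0):
    """Assign a user subset to each sub-model (random design; the paper instead
    derives an optimal code-based design)."""
    rng = np.random.default_rng(seed)
    return [rng.choice(n_users, size=users_per_model, replace=False)
            for _ in range(n_models)]

def aggregate_predictions(per_model_preds):
    """Majority vote over sub-model predictions (integer class ids), where
    per_model_preds has shape (n_models, n_points). A backdoor only flips the
    final output if it corrupts more than half of the sub-models."""
    preds = np.asarray(per_model_preds)
    n_classes = preds.max() + 1
    votes = np.apply_along_axis(lambda col: np.bincount(col, minlength=n_classes),
                                0, preds)            # shape (n_classes, n_points)
    return votes.argmax(axis=0)

# Hypothetical wiring (train_model, user_data, test_X are placeholders, not a real API):
# subsets = build_subsets(n_users=100, n_models=9, users_per_model=30)
# models = [train_model([user_data[u] for u in s]) for s in subsets]
# preds = [m.predict(test_X) for m in models]
# print(aggregate_predictions(preds))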
class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
