CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html>
<html lang="en">
<head>
  <!--
    arXiv search results page: document head, site header, search forms,
    pagination, and the opening of the numbered result list.
  -->
  <meta charset="utf-8"/>
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <!-- new favicon config and versions by realfavicongenerator.net -->
  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png">
  <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png">
  <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest">
  <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b">
  <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico">
  <meta name="msapplication-TileColor" content="#b31b1b">
  <meta name="msapplication-config" content="images/icons/browserconfig.xml">
  <meta name="theme-color" content="#b31b1b">
  <!-- end favicon config -->
  <title>Search | arXiv e-print repository</title>
  <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script>
  <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" />
  <!-- MathJax configuration: only elements whose class matches "mathjax.*" are typeset (ignoreClass/processClass below). -->
  <script type="text/x-mathjax-config">
    MathJax.Hub.Config({
      messageStyle: "none",
      extensions: ["tex2jax.js"],
      jax: ["input/TeX", "output/HTML-CSS"],
      tex2jax: {
        inlineMath: [ ['$','$'], ["\\(","\\)"] ],
        displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
        processEscapes: true,
        ignoreClass: '.*',
        processClass: 'mathjax.*'
      },
      TeX: {
        extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"],
        noErrors: {
          inlineDelimiters: ["$","$"],
          multiLine: false,
          style: {
            "font-size": "normal",
            "border": ""
          }
        }
      },
      "HTML-CSS": { availableFonts: ["TeX"] }
    });
  </script>
  <!-- Explicit https: scheme (was protocol-relative), matching every other static.arxiv.org asset on this page. -->
  <script src="https://static.arxiv.org/MathJax-2.7.3/MathJax.js"></script>
  <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script>
  <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" />
  <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" />
  <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script>
  <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script>
  <style> radio#cf-customfield_11400 { display: none; } </style>
</head>
<body>
<header><a href="#main-container" class="is-sr-only">Skip to main content</a>
  <!-- contains Cornell logo and sponsor statement -->
  <div class="attribution level is-marginless" role="banner">
    <div class="level-left">
      <!-- No aria-label on the image: it would override the alt text and name the link "logo". -->
      <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" /></a>
    </div>
    <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors.
      <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div>
  </div>
  <!-- contains arXiv identity and search bar -->
  <div class="identity level is-marginless">
    <div class="level-left">
      <div class="level-item">
        <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo">
          <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" width="85" style="width:85px;"/>
        </a>
      </div>
    </div>
    <div class="search-block level-right">
      <!-- Compact header search; submits to the site-wide search endpoint with source=header. -->
      <form class="level-item mini-search" method="GET" action="https://arxiv.org/search">
        <div class="field has-addons">
          <div class="control">
            <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" />
            <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p>
          </div>
          <div class="control">
            <div class="select is-small">
              <select name="searchtype" aria-label="Field to search">
                <option value="all" selected="selected">All fields</option>
                <option value="title">Title</option>
                <option value="author">Author</option>
                <option value="abstract">Abstract</option>
                <option value="comments">Comments</option>
                <option value="journal_ref">Journal reference</option>
                <option value="acm_class">ACM classification</option>
                <option value="msc_class">MSC classification</option>
                <option value="report_num">Report number</option>
                <option value="paper_id">arXiv identifier</option>
                <option value="doi">DOI</option>
                <option value="orcid">ORCID</option>
                <option value="author_id">arXiv author ID</option>
                <option value="help">Help pages</option>
                <option value="full_text">Full text</option>
              </select>
            </div>
          </div>
          <input type="hidden" name="source" value="header">
          <button type="submit" class="button is-small is-cul-darker">Search</button>
        </div>
      </form>
    </div>
  </div> <!-- closes identity -->
  <div class="container">
    <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu">
      <a href="https://arxiv.org/login">Login</a>
    </div>
  </div>
</header>
<main class="container" id="main-container">
  <div class="level is-marginless">
    <div class="level-left">
      <h1 class="title is-clearfix">
        Showing 1&ndash;50 of 102 results for author: <span class="mathjax">Qin, Y</span>
      </h1>
    </div>
    <div class="level-right is-hidden-mobile">
      <!-- feedback for mobile is moved to footer -->
      <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span>
    </div>
  </div>
  <div class="content">
    <!-- Main search form, scoped to the eess archive. role="search" (previously the non-existent "aria-role" attribute) exposes it as a search landmark. -->
    <form method="GET" action="/search/eess" role="search">
      Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Qin%2C+Y">Search in all archives.</a>
      <div class="field has-addons-tablet">
        <div class="control is-expanded">
          <label for="query" class="hidden-label">Search term or terms</label>
          <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Qin, Y">
        </div>
        <div class="select control is-medium">
          <label class="is-hidden" for="searchtype">Field</label>
          <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select>
        </div>
        <div class="control">
          <button type="submit" class="button is-link is-medium">Search</button>
        </div>
      </div>
      <div class="field">
        <div class="control is-size-7">
          <label class="radio">
            <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts
          </label>
          <label class="radio">
            <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts
          </label>
        </div>
      </div>
      <div class="is-clearfix" style="height: 2.5em">
        <div class="is-pulled-right">
          <a href="/search/advanced?terms-0-term=Qin%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a>
        </div>
      </div>
      <!-- Carry the current ordering and page size along with a new query. -->
      <input type="hidden" name="order" value="-announced_date_first">
      <input type="hidden" name="size" value="50">
    </form>
    <div class="level breathe-horizontal">
      <div class="level-left">
        <!-- Page-size / sort-order form; it re-submits the current query through the hidden controls below. -->
        <form method="GET" action="/search/">
          <!--
            NOTE(review): this hidden block repeats the ids "searchtype", "query",
            "abstracts-0" and "abstracts-1" already used by the search form above.
            Ids should be unique per document; left unchanged here because external
            scripts or styles may depend on them. Confirm before renaming.
          -->
          <div style="display: none;">
            <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select>
            <input id="query" name="query" type="text" value="Qin, Y">
            <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul>
          </div>
          <div class="box field is-grouped is-grouped-multiline level-item">
            <div class="control">
              <span class="select is-small">
                <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select>
              </span>
              <label for="size">results per page</label>.
            </div>
            <div class="control">
              <label for="order">Sort results by</label>
              <span class="select is-small">
                <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select>
              </span>
            </div>
            <div class="control">
              <button type="submit" class="button is-small is-link">Go</button>
            </div>
          </div>
        </form>
      </div>
    </div>
    <!--
      Pagination. Only the link for the page being shown (page 1, start=0) carries
      aria-current="page"; the other links are labelled as "Goto page N".
      Previously the current page had no aria-current while pages 2 and 3 did.
    -->
    <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination">
      <a href="" class="pagination-previous is-invisible">Previous </a>
      <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=50" class="pagination-next" >Next </a>
      <ul class="pagination-list">
        <li>
          <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a>
        </li>
        <li>
          <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a>
        </li>
        <li>
          <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=100" class="pagination-link " aria-label="Goto page 3">3 </a>
        </li>
      </ul>
    </nav>
    <!-- Numbered result list; each entry is an <li class="arxiv-result">. -->
    <ol class="breathe-horizontal" start="1">
      <li class="arxiv-result">
        <div class="is-marginless">
          <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06808">arXiv:2411.06808</a>
            <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06808">pdf</a>, <a href="https://arxiv.org/format/2411.06808">other</a>]&nbsp;</span>
          </p>
          <div class="tags is-inline-block">
            <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span>
            <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span>
          </div>
        </div>
        <!-- The opening tag of the title paragraph is completed by the attributes on the following source line. -->
        <p
class="title is-5 mathjax"> Modeling and Detection of Critical Slowing Down in Epileptic Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yuzhen Qin</a>, <a href="/search/eess?searchtype=author&amp;query=van+Gerven%2C+M">Marcel van Gerven</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06808v1-abstract-short" style="display: inline;"> Epilepsy is a common neurological disorder characterized by abrupt seizures. Although seizures may appear random, they are often preceded by early warning signs in neural signals, notably, critical slowing down, a phenomenon in which the system&#39;s recovery rate from perturbations declines when it approaches a critical point. Detecting these markers could enable preventive therapies. This paper intr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06808v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06808v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06808v1-abstract-full" style="display: none;"> Epilepsy is a common neurological disorder characterized by abrupt seizures. Although seizures may appear random, they are often preceded by early warning signs in neural signals, notably, critical slowing down, a phenomenon in which the system&#39;s recovery rate from perturbations declines when it approaches a critical point. Detecting these markers could enable preventive therapies. This paper introduces a multi-stable slow-fast system to capture critical slowing down in epileptic dynamics. We construct regions of attraction for stable states, shedding light on how dynamic bifurcations drive pathological oscillations. 
We derive the recovery rate after perturbations to formalize critical slowing down. A novel algorithm for detecting precursors to ictal transitions is presented, along with a proof-of-concept event-based feedback control strategy to prevent impending pathological oscillations. Numerical studies are conducted to validate our theoretical findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06808v1-abstract-full').style.display = 'none'; document.getElementById('2411.06808v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ECC2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18584">arXiv:2409.18584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.18584">pdf</a>, <a href="https://arxiv.org/format/2409.18584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ChildMandarin: A Comprehensive Mandarin Speech Dataset for Young Children Aged 3-5 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a 
href="/search/eess?searchtype=author&amp;query=He%2C+J">Jiabei He</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yujie Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18584v2-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children&#39;s speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18584v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18584v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18584v2-abstract-full" style="display: none;"> Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children&#39;s speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused on children aged 3 to 5, addressing the scarcity of resources in this area. 
The dataset comprises 41.25 hours of speech with carefully crafted manual transcriptions, collected from 397 speakers across various provinces in China, with balanced gender representation. We provide a comprehensive analysis of speaker demographics, speech duration distribution and geographic coverage. Additionally, we evaluate ASR performance on models trained from scratch, such as Conformer, as well as fine-tuned pre-trained models like HuBERT and Whisper, where fine-tuning demonstrates significant performance improvements. Furthermore, we assess speaker verification (SV) on our dataset, showing that, despite the challenges posed by the unique vocal characteristics of young children, the dataset effectively supports both ASR and SV tasks. This dataset is a valuable contribution to Mandarin child speech research and holds potential for applications in educational technology and child-computer interaction. It will be open-source and freely available for all academic purposes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18584v2-abstract-full').style.display = 'none'; document.getElementById('2409.18584v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12992">arXiv:2409.12992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12992">pdf</a>, <a href="https://arxiv.org/format/2409.12992">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiffEditor: Enhancing Speech Editing with Semantic Enrichment and Acoustic Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Y">Yuhang Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Haoran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+J">Jiarong Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12992v1-abstract-short" style="display: inline;"> As text-based speech editing becomes increasingly prevalent, the demand for unrestricted free-text editing continues to grow. 
However, existing speech editing techniques encounter significant challenges, particularly in maintaining intelligibility and acoustic consistency when dealing with out-of-domain (OOD) text. In this paper, we introduce, DiffEditor, a novel speech editing model designed to e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12992v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12992v1-abstract-full" style="display: none;"> As text-based speech editing becomes increasingly prevalent, the demand for unrestricted free-text editing continues to grow. However, existing speech editing techniques encounter significant challenges, particularly in maintaining intelligibility and acoustic consistency when dealing with out-of-domain (OOD) text. In this paper, we introduce, DiffEditor, a novel speech editing model designed to enhance performance in OOD text scenarios through semantic enrichment and acoustic consistency. To improve the intelligibility of the edited speech, we enrich the semantic information of phoneme embeddings by integrating word embeddings extracted from a pretrained language model. Furthermore, we emphasize that interframe smoothing properties are critical for modeling acoustic consistency, and thus we propose a first-order loss function to promote smoother transitions at editing boundaries and enhance the overall fluency of the edited speech. Experimental results demonstrate that our model achieves state-of-the-art performance in both in-domain and OOD text scenarios. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12992v1-abstract-full').style.display = 'none'; document.getElementById('2409.12992v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12466">arXiv:2409.12466</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12466">pdf</a>, <a href="https://arxiv.org/format/2409.12466">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AudioEditor: A Training-Free Diffusion-Based Audio Editing Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Y">Yuhang Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+J">Jinghua Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12466v2-abstract-short" style="display: inline;"> Diffusion-based text-to-audio (TTA) 
generation has made substantial progress, leveraging latent diffusion model (LDM) to produce high-quality, diverse and instruction-relevant audios. However, beyond generation, the task of audio editing remains equally important but has received comparatively little attention. Audio editing tasks face two primary challenges: executing precise edits and preserving&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12466v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12466v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12466v2-abstract-full" style="display: none;"> Diffusion-based text-to-audio (TTA) generation has made substantial progress, leveraging latent diffusion model (LDM) to produce high-quality, diverse and instruction-relevant audios. However, beyond generation, the task of audio editing remains equally important but has received comparatively little attention. Audio editing tasks face two primary challenges: executing precise edits and preserving the unedited sections. While workflows based on LDMs have effectively addressed these challenges in the field of image processing, similar approaches have been scarcely applied to audio editing. In this paper, we introduce AudioEditor, a training-free audio editing framework built on the pretrained diffusion-based TTA model. AudioEditor incorporates Null-text Inversion and EOT-suppression methods, enabling the model to preserve original audio features while executing accurate edits. Comprehensive objective and subjective experiments validate the effectiveness of AudioEditor in delivering high-quality audio edits. Code and demo can be found at https://github.com/NKU-HLT/AudioEditor. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12466v2-abstract-full').style.display = 'none'; document.getElementById('2409.12466v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11889">arXiv:2409.11889</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11889">pdf</a>, <a href="https://arxiv.org/format/2409.11889">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> M2R-Whisper: Multi-stage and Multi-scale Retrieval Augmentation for Enhancing Whisper </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jiabei He</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> 
</p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11889v1-abstract-short" style="display: inline;"> State-of-the-art models like OpenAI&#39;s Whisper exhibit strong performance in multilingual automatic speech recognition (ASR), but they still face challenges in accurately recognizing diverse subdialects. In this paper, we propose M2R-whisper, a novel multi-stage and multi-scale retrieval augmentation approach designed to enhance ASR performance in low-resource settings. Building on the principles o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11889v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11889v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11889v1-abstract-full" style="display: none;"> State-of-the-art models like OpenAI&#39;s Whisper exhibit strong performance in multilingual automatic speech recognition (ASR), but they still face challenges in accurately recognizing diverse subdialects. In this paper, we propose M2R-whisper, a novel multi-stage and multi-scale retrieval augmentation approach designed to enhance ASR performance in low-resource settings. Building on the principles of in-context learning (ICL) and retrieval-augmented techniques, our method employs sentence-level ICL in the pre-processing stage to harness contextual information, while integrating token-level k-Nearest Neighbors (kNN) retrieval as a post-processing step to further refine the final output distribution. By synergistically combining sentence-level and token-level retrieval strategies, M2R-whisper effectively mitigates various types of recognition errors. 
Experiments conducted on Mandarin and subdialect datasets, including AISHELL-1 and KeSpeech, demonstrate substantial improvements in ASR accuracy, all achieved without any parameter updates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11889v1-abstract-full').style.display = 'none'; document.getElementById('2409.11889v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10048">arXiv:2409.10048</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.10048">pdf</a>, <a href="https://arxiv.org/format/2409.10048">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Driven Reinforcement Learning for Head-Orientation in Naturalistic Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ledder%2C+W">Wessel Ledder</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yuzhen Qin</a>, <a href="/search/eess?searchtype=author&amp;query=van+der+Heijden%2C+K">Kiki van der Heijden</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10048v1-abstract-short" style="display: inline;"> 
Although deep reinforcement learning (DRL) approaches in audio signal processing have seen substantial progress in recent years, audio-driven DRL for tasks such as navigation, gaze control and head-orientation control in the context of human-robot interaction have received little attention. Here, we propose an audio-driven DRL framework in which we utilise deep Q-learning to develop an autonomous&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10048v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10048v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10048v1-abstract-full" style="display: none;"> Although deep reinforcement learning (DRL) approaches in audio signal processing have seen substantial progress in recent years, audio-driven DRL for tasks such as navigation, gaze control and head-orientation control in the context of human-robot interaction have received little attention. Here, we propose an audio-driven DRL framework in which we utilise deep Q-learning to develop an autonomous agent that orients towards a talker in the acoustic environment based on stereo speech recordings. Our results show that the agent learned to perform the task at a near perfect level when trained on speech segments in anechoic environments (that is, without reverberation). The presence of reverberation in naturalistic acoustic environments affected the agent&#39;s performance, although the agent still substantially outperformed a baseline, randomly acting agent. Finally, we quantified the degree of generalization of the proposed DRL approach across naturalistic acoustic environments. 
Our experiments revealed that policies learned by agents trained on medium or high reverb environments generalized to low reverb environments, but policies learned by agents trained on anechoic or low reverb environments did not generalize to medium or high reverb environments. Taken together, this study demonstrates the potential of audio-driven DRL for tasks such as head-orientation control and highlights the need for training strategies that enable robust generalization across environments for real-world audio-driven DRL applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10048v1-abstract-full').style.display = 'none'; document.getElementById('2409.10048v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05430">arXiv:2409.05430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05430">pdf</a>, <a href="https://arxiv.org/format/2409.05430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Findings of the 2024 Mandarin Stuttering Event Detection and Automatic Speech Recognition Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+R">Rong Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lezhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+B">Bin Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05430v1-abstract-short" style="display: inline;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech;&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05430v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05430v1-abstract-full" style="display: none;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech; and (3) Research track for innovative approaches utilizing the provided dataset. We utilize an open-source Mandarin stuttering dataset AS-70, which has been split into new training and test sets for the challenge. This paper presents the dataset, details the challenge tracks, and analyzes the performance of the top systems, highlighting improvements in detection accuracy and reductions in recognition error rates. Our findings underscore the potential of specialized models and augmentation strategies in developing stuttered speech technologies. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'none'; document.getElementById('2409.05430v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures, accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04799">arXiv:2409.04799</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04799">pdf</a>, <a href="https://arxiv.org/format/2409.04799">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> PB-LRDWWS System for the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04799v1-abstract-short" style="display: inline;"> For the SLT 2024 Low-Resource Dysarthria Wake-Up 
Word Spotting (LRDWWS) Challenge, we introduce the PB-LRDWWS system. This system combines a dysarthric speech content feature extractor for prototype construction with a prototype-based classification method. The feature extractor is a fine-tuned HuBERT model obtained through a three-stage fine-tuning process using cross-entropy loss. This fine-tune&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04799v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04799v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04799v1-abstract-full" style="display: none;"> For the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting (LRDWWS) Challenge, we introduce the PB-LRDWWS system. This system combines a dysarthric speech content feature extractor for prototype construction with a prototype-based classification method. The feature extractor is a fine-tuned HuBERT model obtained through a three-stage fine-tuning process using cross-entropy loss. This fine-tuned HuBERT extracts features from the target dysarthric speaker&#39;s enrollment speech to build prototypes. Classification is achieved by calculating the cosine similarity between the HuBERT features of the target dysarthric speaker&#39;s evaluation speech and prototypes. Despite its simplicity, our method demonstrates effectiveness through experimental results. Our system achieves second place in the final Test-B of the LRDWWS Challenge. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04799v1-abstract-full').style.display = 'none'; document.getElementById('2409.04799v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accept by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00141">arXiv:2409.00141</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00141">pdf</a>, <a href="https://arxiv.org/format/2409.00141">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.est.2024.113502">10.1016/j.est.2024.113502 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Graph neural network-based lithium-ion battery state of health estimation using partial discharging curve </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+K+Q">Kate Qi Zhou</a>, <a 
href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yan Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Yuen%2C+C">Chau Yuen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00141v1-abstract-short" style="display: inline;"> Data-driven methods have gained extensive attention in estimating the state of health (SOH) of lithium-ion batteries. Accurate SOH estimation requires degradation-relevant features and alignment of statistical distributions between training and testing datasets. However, current research often overlooks these needs and relies on arbitrary voltage segment selection. To address these challenges, thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00141v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00141v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00141v1-abstract-full" style="display: none;"> Data-driven methods have gained extensive attention in estimating the state of health (SOH) of lithium-ion batteries. Accurate SOH estimation requires degradation-relevant features and alignment of statistical distributions between training and testing datasets. However, current research often overlooks these needs and relies on arbitrary voltage segment selection. To address these challenges, this paper introduces an innovative approach leveraging spatio-temporal degradation dynamics via graph convolutional networks (GCNs). Our method systematically selects discharge voltage segments using the Matrix Profile anomaly detection algorithm, eliminating the need for manual selection and preventing information loss. 
These selected segments form a fundamental structure integrated into the GCN-based SOH estimation model, capturing inter-cycle dynamics and mitigating statistical distribution incongruities between offline training and online testing data. Validation with a widely accepted open-source dataset demonstrates that our method achieves precise SOH estimation, with a root mean squared error of less than 1%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00141v1-abstract-full').style.display = 'none'; document.getElementById('2409.00141v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Energy Storage, Volume 100, Part A, 15 October 2024, 113502 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12829">arXiv:2408.12829</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12829">pdf</a>, <a href="https://arxiv.org/format/2408.12829">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Aware Mean Opinion Score Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a 
href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+X">Xiguang Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12829v1-abstract-short" style="display: inline;"> Mean Opinion Score (MOS) prediction has made significant progress in specific domains. However, the unstable performance of MOS prediction models across diverse samples presents ongoing challenges in the practical application of these systems. In this paper, we point out that the absence of uncertainty modeling is a significant limitation hindering MOS prediction systems from applying to the real&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12829v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12829v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12829v1-abstract-full" style="display: none;"> Mean Opinion Score (MOS) prediction has made significant progress in specific domains. However, the unstable performance of MOS prediction models across diverse samples presents ongoing challenges in the practical application of these systems. In this paper, we point out that the absence of uncertainty modeling is a significant limitation hindering MOS prediction systems from applying to the real and open world. 
We analyze the sources of uncertainty in the MOS prediction task and propose to establish an uncertainty-aware MOS prediction system that models aleatory uncertainty and epistemic uncertainty by heteroscedastic regression and Monte Carlo dropout separately. The experimental results show that the system captures uncertainty well and is capable of performing selective prediction and out-of-domain detection. Such capabilities significantly enhance the practical utility of MOS systems in diverse real and open-world environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12829v1-abstract-full').style.display = 'none'; document.getElementById('2408.12829v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024, oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09132">arXiv:2408.09132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.09132">pdf</a>, <a href="https://arxiv.org/format/2408.09132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> RIS-based Over-the-air Diffractional Channel Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hui%2C+Y">Yingzhe Hui</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Shuyi Chen</a>, <a 
href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yifan Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+W">Weixiao Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Q">Qiushi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+W">Wei Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09132v1-abstract-short" style="display: inline;"> Reconfigurable Intelligent Surfaces (RIS) are programmable metasurfaces utilizing sub-wavelength meta-atoms and a controller for precise electromagnetic wave manipulation. This work introduces an innovative channel coding scheme, termed RIS-based diffractional channel coding (DCC), which capitalizes on diffraction between two RIS layers for signal-level encoding. Contrary to traditional methods, D&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09132v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09132v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09132v1-abstract-full" style="display: none;"> Reconfigurable Intelligent Surfaces (RIS) are programmable metasurfaces utilizing sub-wavelength meta-atoms and a controller for precise electromagnetic wave manipulation. This work introduces an innovative channel coding scheme, termed RIS-based diffractional channel coding (DCC), which capitalizes on diffraction between two RIS layers for signal-level encoding. Contrary to traditional methods, DCC expands signal dimensions through diffraction, presenting a novel countermeasure to channel effects. 
This paper focuses on the operational principles of DCC, including encoder and decoder designs, and explores its possibilities to construct block and trellis codes, demonstrating its potential as both an alternative and a supplementary conventional coding scheme. Key advantages of DCC include eliminating extra power requirements for encoding, achieving computation at the speed of light, and enabling adjustable code distance, making it a progressive solution for efficient wireless communication, particularly in systems with large-scale data or massive MIMO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09132v1-abstract-full').style.display = 'none'; document.getElementById('2408.09132v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures, accepted by IEEE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02085">arXiv:2408.02085</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.02085">pdf</a>, <a href="https://arxiv.org/format/2408.02085">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Power of Data Tsunami: A Comprehensive Survey on Data Assessment and Selection for Instruction Tuning of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yulei Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yuncheng Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Gang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+H">Hang Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+Y">Yuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zihan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+Y">Yun Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+K">Ke Li</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xing Sun</a> </p> <p 
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02085v3-abstract-short" style="display: inline;"> Instruction tuning plays a critical role in aligning large language models (LLMs) with human preference. Despite the vast amount of open instruction datasets, naively training a LLM on all existing instructions may not be optimal and practical. To pinpoint the most beneficial datapoints, data assessment and selection methods have been proposed in the fields of natural language processing (NLP) and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02085v3-abstract-full').style.display = 'inline'; document.getElementById('2408.02085v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02085v3-abstract-full" style="display: none;"> Instruction tuning plays a critical role in aligning large language models (LLMs) with human preference. Despite the vast amount of open instruction datasets, naively training a LLM on all existing instructions may not be optimal and practical. To pinpoint the most beneficial datapoints, data assessment and selection methods have been proposed in the fields of natural language processing (NLP) and deep learning. However, under the context of instruction tuning, there still exists a gap in knowledge on what kind of data evaluation metrics can be employed and how they can be integrated into the selection mechanism. To bridge this gap, we present a comprehensive review on existing literature of data assessment and selection especially for instruction tuning of LLMs. We systematically categorize all applicable methods into quality-based, diversity-based, and importance-based ones where a unified, fine-grained taxonomy is structured. 
For each category, representative methods are elaborated to describe the landscape of relevant research. In addition, comparison between latest methods is conducted on their officially reported results to provide in-depth discussions on their limitations. Finally, we summarize the open challenges and propose the promising avenues for future studies. All related contents are available at https://github.com/yuleiqin/fantastic-data-engineering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02085v3-abstract-full').style.display = 'none'; document.getElementById('2408.02085v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">review, survey, 28 pages, 2 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00325">arXiv:2408.00325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00325">pdf</a>, <a href="https://arxiv.org/format/2408.00325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Iterative Prototype Refinement for Ambiguous Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+X">Xiangyu Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00325v1-abstract-short" style="display: inline;"> Recognizing emotions from speech is a daunting task due to the subtlety and ambiguity of expressions. Traditional speech emotion recognition (SER) systems, which typically rely on a singular, precise emotion label, struggle with this complexity. Therefore, modeling the inherent ambiguity of emotions is an urgent problem. In this paper, we propose an iterative prototype refinement framework (IPR) f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00325v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00325v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00325v1-abstract-full" style="display: none;"> Recognizing emotions from speech is a daunting task due to the subtlety and ambiguity of expressions. Traditional speech emotion recognition (SER) systems, which typically rely on a singular, precise emotion label, struggle with this complexity. Therefore, modeling the inherent ambiguity of emotions is an urgent problem. In this paper, we propose an iterative prototype refinement framework (IPR) for ambiguous SER. 
IPR comprises two interlinked components: contrastive learning and class prototypes. The former provides an efficient way to obtain high-quality representations of ambiguous samples. The latter are dynamically updated based on ambiguous labels -- the similarity of the ambiguous data to all prototypes. These refined embeddings yield precise pseudo labels, thus reinforcing representation quality. Experimental evaluations conducted on the IEMOCAP dataset validate the superior performance of IPR over state-of-the-art methods, thus proving the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00325v1-abstract-full').style.display = 'none'; document.getElementById('2408.00325v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18461">arXiv:2407.18461</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.18461">pdf</a>, <a href="https://arxiv.org/format/2407.18461">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1360">10.21437/Interspeech.2024-1360 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18461v1-abstract-short" style="display: inline;"> Dysarthric speech recognition (DSR) presents a formidable challenge due to inherent inter-speaker variability, leading to severe performance degradation when applying DSR models 
to new dysarthric speakers. Traditional speaker adaptation methodologies typically involve fine-tuning models for each speaker, but this strategy is cost-prohibitive and inconvenient for disabled users, requiring substanti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18461v1-abstract-full').style.display = 'inline'; document.getElementById('2407.18461v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18461v1-abstract-full" style="display: none;"> Dysarthric speech recognition (DSR) presents a formidable challenge due to inherent inter-speaker variability, leading to severe performance degradation when applying DSR models to new dysarthric speakers. Traditional speaker adaptation methodologies typically involve fine-tuning models for each speaker, but this strategy is cost-prohibitive and inconvenient for disabled users, requiring substantial data collection. To address this issue, we introduce a prototype-based approach that markedly improves DSR performance for unseen dysarthric speakers without additional fine-tuning. Our method employs a feature extractor trained with HuBERT to produce per-word prototypes that encapsulate the characteristics of previously unseen speakers. These prototypes serve as the basis for classification. Additionally, we incorporate supervised contrastive learning to refine feature extraction. By enhancing representation quality, we further improve DSR performance, enabling effective personalized DSR. We release our code at https://github.com/NKU-HLT/PB-DSR. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18461v1-abstract-full').style.display = 'none'; document.getElementById('2407.18461v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> INTERSPEECH 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09029">arXiv:2407.09029</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09029">pdf</a>, <a href="https://arxiv.org/format/2407.09029">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Emotion Recognition in Incomplete Data: A Novel Cross-Modal Alignment, Reconstruction, and Refinement Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a 
href="/search/eess?searchtype=author&amp;query=Li%2C+S">Shaokai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+X">Xiangyu Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09029v1-abstract-short" style="display: inline;"> Multimodal emotion recognition systems rely heavily on the full availability of modalities, suffering significant performance declines when modal data is incomplete. To tackle this issue, we present the Cross-Modal Alignment, Reconstruction, and Refinement (CM-ARR) framework, an innovative approach that sequentially engages in cross-modal alignment, reconstruction, and refinement phases to handle&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09029v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09029v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09029v1-abstract-full" style="display: none;"> Multimodal emotion recognition systems rely heavily on the full availability of modalities, suffering significant performance declines when modal data is incomplete. 
To tackle this issue, we present the Cross-Modal Alignment, Reconstruction, and Refinement (CM-ARR) framework, an innovative approach that sequentially engages in cross-modal alignment, reconstruction, and refinement phases to handle missing modalities and enhance emotion recognition. This framework utilizes unsupervised distribution-based contrastive learning to align heterogeneous modal distributions, reducing discrepancies and modeling semantic uncertainty effectively. The reconstruction phase applies normalizing flow models to transform these aligned distributions and recover missing modalities. The refinement phase employs supervised point-based contrastive learning to disrupt semantic correlations and accentuate emotional traits, thereby enriching the affective content of the reconstructed representations. Extensive experiments on the IEMOCAP and MSP-IMPROV datasets confirm the superior performance of CM-ARR under conditions of both missing and complete modalities. Notably, averaged across six scenarios of missing modalities, CM-ARR achieves absolute improvements of 2.11% in WAR and 2.12% in UAR on the IEMOCAP dataset, and 1.71% and 1.96% in WAR and UAR, respectively, on the MSP-IMPROV dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09029v1-abstract-full').style.display = 'none'; document.getElementById('2407.09029v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08203">arXiv:2406.08203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08203">pdf</a>, <a href="https://arxiv.org/format/2406.08203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> LAFMA: A Latent Flow Matching Model for Text-to-Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Guan%2C+W">Wenhao Guan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+K">Kaidi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+W">Wangjin Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yang Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+F">Feng Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Lin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Hong%2C+Q">Qingyang Hong</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08203v1-abstract-short" style="display: inline;"> Recently, the application of diffusion models has facilitated the significant development of speech and audio generation. Nevertheless, the quality of samples generated by diffusion models still needs improvement. 
And the effectiveness of the method is accompanied by the extensive number of sampling steps, leading to an extended synthesis time necessary for generating high-quality audio. Previous&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08203v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08203v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08203v1-abstract-full" style="display: none;"> Recently, the application of diffusion models has facilitated the significant development of speech and audio generation. Nevertheless, the quality of samples generated by diffusion models still needs improvement. And the effectiveness of the method is accompanied by the extensive number of sampling steps, leading to an extended synthesis time necessary for generating high-quality audio. Previous Text-to-Audio (TTA) methods mostly used diffusion models in the latent space for audio generation. In this paper, we explore the integration of the Flow Matching (FM) model into the audio latent space for audio generation. The FM is an alternative simulation-free method that trains continuous normalization flows (CNF) based on regressing vector fields. We demonstrate that our model significantly enhances the quality of generated audio samples, achieving better performance than prior models. Moreover, it reduces the number of inference steps to ten steps almost without sacrificing performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08203v1-abstract-full').style.display = 'none'; document.getElementById('2406.08203v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07256">arXiv:2406.07256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07256">pdf</a>, <a href="https://arxiv.org/ps/2406.07256">ps</a>, <a href="https://arxiv.org/format/2406.07256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AS-70: A Mandarin stuttered speech dataset for automatic speech recognition and stuttering event detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gong%2C+R">Rong Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lezhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Q">Qisheng Li</a>, <a 
href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Shaomei Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Bin%2C+J">Jia Bin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Ming Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07256v1-abstract-short" style="display: inline;"> The rapid advancements in speech technologies over the past two decades have led to human-level performance in tasks like automatic speech recognition (ASR) for fluent speech. However, the efficacy of these models diminishes when applied to atypical speech, such as stuttering. This paper introduces AS-70, the first publicly available Mandarin stuttered speech dataset, which stands out as the large&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07256v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07256v1-abstract-full" style="display: none;"> The rapid advancements in speech technologies over the past two decades have led to human-level performance in tasks like automatic speech recognition (ASR) for fluent speech. However, the efficacy of these models diminishes when applied to atypical speech, such as stuttering. 
This paper introduces AS-70, the first publicly available Mandarin stuttered speech dataset, which stands out as the largest dataset in its category. Encompassing conversational and voice command reading speech, AS-70 includes verbatim manual transcription, rendering it suitable for various speech-related tasks. Furthermore, baseline systems are established, and experimental results are presented for ASR and stuttering event detection (SED) tasks. By incorporating this dataset into the model fine-tuning, significant improvements in the state-of-the-art ASR models, e.g., Whisper and Hubert, are observed, enhancing their inclusivity in addressing stuttered speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07256v1-abstract-full').style.display = 'none'; document.getElementById('2406.07256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03814">arXiv:2406.03814</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03814">pdf</a>, <a href="https://arxiv.org/format/2406.03814">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and Gated Monolingual Datastores </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Tian-Hao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03814v2-abstract-short" style="display: inline;"> The kNN-CTC model has proven to be effective for monolingual automatic speech recognition (ASR). 
However, its direct application to multilingual scenarios like code-switching, presents challenges. Although there is potential for performance improvement, a kNN-CTC model utilizing a single bilingual datastore can inadvertently introduce undesirable noise from the alternative language. To address thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03814v2-abstract-full').style.display = 'inline'; document.getElementById('2406.03814v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03814v2-abstract-full" style="display: none;"> The kNN-CTC model has proven to be effective for monolingual automatic speech recognition (ASR). However, its direct application to multilingual scenarios like code-switching, presents challenges. Although there is potential for performance improvement, a kNN-CTC model utilizing a single bilingual datastore can inadvertently introduce undesirable noise from the alternative language. To address this, we propose a novel kNN-CTC-based code-switching ASR (CS-ASR) framework that employs dual monolingual datastores and a gated datastore selection mechanism to reduce noise interference. Our method selects the appropriate datastore for decoding each frame, ensuring the injection of language-specific information into the ASR process. We apply this framework to cutting-edge CTC-based models, developing an advanced CS-ASR system. Extensive experiments demonstrate the remarkable effectiveness of our gated datastore mechanism in enhancing the performance of zero-shot Chinese-English CS-ASR. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03814v2-abstract-full').style.display = 'none'; document.getElementById('2406.03814v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14900">arXiv:2405.14900</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.14900">pdf</a>, <a href="https://arxiv.org/format/2405.14900">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.media.2024.103206">10.1016/j.media.2024.103206 
<i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Fair Evaluation of Federated Learning Algorithms for Automated Breast Density Classification: The Results of the 2022 ACR-NCI-NVIDIA Federated Learning Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Schmidt%2C+K">Kendall Schmidt</a>, <a href="/search/eess?searchtype=author&amp;query=Bearce%2C+B">Benjamin Bearce</a>, <a href="/search/eess?searchtype=author&amp;query=Chang%2C+K">Ken Chang</a>, <a href="/search/eess?searchtype=author&amp;query=Coombs%2C+L">Laura Coombs</a>, <a href="/search/eess?searchtype=author&amp;query=Farahani%2C+K">Keyvan Farahani</a>, <a href="/search/eess?searchtype=author&amp;query=Elbatele%2C+M">Marawan Elbatele</a>, <a href="/search/eess?searchtype=author&amp;query=Mouhebe%2C+K">Kaouther Mouhebe</a>, <a href="/search/eess?searchtype=author&amp;query=Marti%2C+R">Robert Marti</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+R">Ruipeng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yanfeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Y">Yaojun Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Ying%2C+H">Haochao Ying</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yuyang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Testagrose%2C+C">Conrad Testagrose</a>, <a href="/search/eess?searchtype=author&amp;query=Demirer%2C+M">Mutlu Demirer</a>, <a href="/search/eess?searchtype=author&amp;query=Gupta%2C+V">Vikash Gupta</a>, <a href="/search/eess?searchtype=author&amp;query=Ak%C3%BCnal%2C+%C3%9C">Ünal Akünal</a>, <a href="/search/eess?searchtype=author&amp;query=Bujotzek%2C+M">Markus Bujotzek</a>, <a href="/search/eess?searchtype=author&amp;query=Maier-Hein%2C+K+H">Klaus H. 
Maier-Hein</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yi Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaomeng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Kalpathy-Cramer%2C+J">Jayashree Kalpathy-Cramer</a>, <a href="/search/eess?searchtype=author&amp;query=Roth%2C+H+R">Holger R. Roth</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14900v1-abstract-short" style="display: inline;"> The correct interpretation of breast density is important in the assessment of breast cancer risk. AI has been shown capable of accurately predicting breast density, however, due to the differences in imaging characteristics across mammography systems, models built using data from one system do not generalize well to other systems. Though federated learning (FL) has emerged as a way to improve the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14900v1-abstract-full').style.display = 'inline'; document.getElementById('2405.14900v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14900v1-abstract-full" style="display: none;"> The correct interpretation of breast density is important in the assessment of breast cancer risk. AI has been shown capable of accurately predicting breast density, however, due to the differences in imaging characteristics across mammography systems, models built using data from one system do not generalize well to other systems. Though federated learning (FL) has emerged as a way to improve the generalizability of AI without the need to share data, the best way to preserve features from all training data during FL is an active area of research. 
To explore FL methodology, the breast density classification FL challenge was hosted in partnership with the American College of Radiology, Harvard Medical School&#39;s Mass General Brigham, University of Colorado, NVIDIA, and the National Institutes of Health National Cancer Institute. Challenge participants were able to submit docker containers capable of implementing FL on three simulated medical facilities, each containing a unique large mammography dataset. The breast density FL challenge ran from June 15 to September 5, 2022, attracting seven finalists from around the world. The winning FL submission reached a linear kappa score of 0.653 on the challenge test data and 0.413 on an external testing dataset, scoring comparably to a model trained on the same data in a central location. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14900v1-abstract-full').style.display = 'none'; document.getElementById('2405.14900v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 9 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Medical Image Analysis Volume 95, July 2024, 103206 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06746">arXiv:2404.06746</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.06746">pdf</a>, <a href="https://arxiv.org/format/2404.06746">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Data-driven parallel Koopman subsystem modeling and distributed moving horizon state estimation for large-scale nonlinear processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaojie Li</a>, <a href="/search/eess?searchtype=author&amp;query=Bo%2C+S">Song Bo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xuewen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yan Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xunyuan Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06746v1-abstract-short" style="display: inline;"> In this work, we consider a state estimation problem for large-scale nonlinear processes in the absence of first-principles process models. By exploiting process operation data, both process modeling and state estimation design are addressed within a distributed framework. 
By leveraging the Koopman operator concept, a parallel subsystem modeling approach is proposed to establish interactive linear&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06746v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06746v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06746v1-abstract-full" style="display: none;"> In this work, we consider a state estimation problem for large-scale nonlinear processes in the absence of first-principles process models. By exploiting process operation data, both process modeling and state estimation design are addressed within a distributed framework. By leveraging the Koopman operator concept, a parallel subsystem modeling approach is proposed to establish interactive linear subsystem process models in higher-dimensional subspaces, each of which correlates with the original nonlinear subspace of the corresponding process subsystem via a nonlinear mapping. The data-driven linear subsystem models can be used to collaboratively characterize and predict the dynamical behaviors of the entire nonlinear process. Based on the established subsystem models, local state estimators that can explicitly handle process operation constraints are designed using moving horizon estimation. The local estimators are integrated via information exchange to form a distributed estimation scheme, which provides estimates of the unmeasured/unmeasurable state variables of the original nonlinear process in a linear manner. The proposed framework is applied to a chemical process and an agro-hydrological process to illustrate its effectiveness and applicability. Good open-loop predictability of the linear subsystem models is confirmed, and accurate estimates of the process states are obtained without requiring a first-principles process model. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06746v1-abstract-full').style.display = 'none'; document.getElementById('2404.06746v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06706">arXiv:2404.06706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.06706">pdf</a>, <a href="https://arxiv.org/ps/2404.06706">ps</a>, <a href="https://arxiv.org/format/2404.06706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Iterative distributed moving horizon estimation of linear systems with penalties on both system disturbances and noise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaojie Li</a>, <a href="/search/eess?searchtype=author&amp;query=Bo%2C+S">Song Bo</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yan Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xunyuan Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06706v1-abstract-short" style="display: inline;"> In this paper, partition-based distributed state estimation of general linear systems is considered. 
A distributed moving horizon state estimation scheme is developed via decomposing the entire system model into subsystem models and partitioning the global objective function of centralized moving horizon estimation (MHE) into local objective functions. The subsystem estimators of the distributed s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06706v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06706v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06706v1-abstract-full" style="display: none;"> In this paper, partition-based distributed state estimation of general linear systems is considered. A distributed moving horizon state estimation scheme is developed via decomposing the entire system model into subsystem models and partitioning the global objective function of centralized moving horizon estimation (MHE) into local objective functions. The subsystem estimators of the distributed scheme that are required to be executed iteratively within each sampling period are designed based on MHE. Two distributed MHE algorithms are proposed to handle the unconstrained case and the case when hard constraints on states and disturbances, respectively. Sufficient conditions on the convergence of the estimates and the stability of the estimation error dynamics for the entire system are derived for both cases. A benchmark reactor-separator process example is introduced to illustrate the proposed distributed state estimation approach. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06706v1-abstract-full').style.display = 'none'; document.getElementById('2404.06706v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.05911">arXiv:2404.05911</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.05911">pdf</a>, <a href="https://arxiv.org/format/2404.05911">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions for Brain Tumor Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Alwadee%2C+E+J">Ebtihal J. Alwadee</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xianfang Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yipeng Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Langbein%2C+F+C">Frank C. Langbein</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.05911v1-abstract-short" style="display: inline;"> Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI) scans is crucial for prompt and effective treatment. 
However, this process faces the challenge of precise delineation due to the tumors&#39; complex heterogeneity. Moreover, energy sustainability targets and resource limitations, especially in developing countries, require efficient and accessible medical imaging solutions.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05911v1-abstract-full').style.display = 'inline'; document.getElementById('2404.05911v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.05911v1-abstract-full" style="display: none;"> Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI) scans is crucial for prompt and effective treatment. However, this process faces the challenge of precise delineation due to the tumors&#39; complex heterogeneity. Moreover, energy sustainability targets and resource limitations, especially in developing countries, require efficient and accessible medical imaging solutions. The proposed architecture, a Lightweight 3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these issues. It is specifically designed to reduce computational requirements significantly while maintaining high segmentation performance. By incorporating parallel convolutions, it enhances feature representation by capturing multi-scale information. It further integrates an attention mechanism to refine segmentation through selective feature recalibration. LATUP-Net achieves promising segmentation performance: the average Dice scores for the whole tumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%, 83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and 83.92%, respectively. Hausdorff distance metrics further indicate its improved ability to delineate tumor boundaries. 
With its significantly reduced computational demand using only 3.07 M parameters, about 59 times fewer than other state-of-the-art models, and running on a single V100 GPU, LATUP-Net stands out as a promising solution for real-world clinical applications, particularly in settings with limited resources. Investigations into the model&#39;s interpretability, utilizing gradient-weighted class activation mapping and confusion matrices, reveal that while attention mechanisms enhance the segmentation of small regions, their impact is nuanced. Achieving the most accurate tumor delineation requires carefully balancing local and global features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05911v1-abstract-full').style.display = 'none'; document.getElementById('2404.05911v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03409">arXiv:2404.03409</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.03409">pdf</a>, <a href="https://arxiv.org/format/2404.03409">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biological Physics">physics.bio-ph</span> </div> </div> <p class="title is-5 mathjax"> Analytical Characterization of Epileptic Dynamics in a Bistable System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yuzhen Qin</a>, <a href="/search/eess?searchtype=author&amp;query=El-Gazzar%2C+A">Ahmed El-Gazzar</a>, <a href="/search/eess?searchtype=author&amp;query=Bassett%2C+D+S">Danielle S. Bassett</a>, <a href="/search/eess?searchtype=author&amp;query=Pasqualetti%2C+F">Fabio Pasqualetti</a>, <a href="/search/eess?searchtype=author&amp;query=van+Gerven%2C+M">Marcel van Gerven</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03409v1-abstract-short" style="display: inline;"> Epilepsy is one of the most common neurological disorders globally, affecting millions of individuals. Despite significant advancements, the precise mechanisms underlying this condition remain largely unknown, making accurately predicting and preventing epileptic seizures challenging. 
In this paper, we employ a bistable model, where a stable equilibrium and a stable limit cycle coexist, to describ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03409v1-abstract-full').style.display = 'inline'; document.getElementById('2404.03409v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03409v1-abstract-full" style="display: none;"> Epilepsy is one of the most common neurological disorders globally, affecting millions of individuals. Despite significant advancements, the precise mechanisms underlying this condition remain largely unknown, making accurately predicting and preventing epileptic seizures challenging. In this paper, we employ a bistable model, where a stable equilibrium and a stable limit cycle coexist, to describe epileptic dynamics. The equilibrium captures normal steady-state neural activity, while the stable limit cycle signifies seizure-like oscillations. The noise-driven switch from the equilibrium to the limit cycle characterizes the onset of seizures. The differences in the regions of attraction of these two stable states distinguish epileptic brain dynamics from healthy ones. We analytically construct the regions of attraction for both states. Further, using the notion of input-to-state stability, we theoretically show how the regions of attraction influence the stability of the system subject to external perturbations. Generalizing the bistable system into coupled networks, we also find the role of network parameters in shaping the regions of attraction. Our findings shed light on the intricate interplay between brain networks and epileptic activity, offering mechanistic insights into potential avenues for more predictable treatments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03409v1-abstract-full').style.display = 'none'; document.getElementById('2404.03409v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures, submitted to IEEE CDC 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03253">arXiv:2404.03253</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.03253">pdf</a>, <a href="https://arxiv.org/format/2404.03253">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+K">Kai Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Meige Li</a>, <a 
href="/search/eess?searchtype=author&amp;query=Si%2C+L">Liping Si</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yingwei Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+Y">Yu Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Q">Qixing Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yang Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+L">Ling Xu</a>, <a href="/search/eess?searchtype=author&amp;query=van+der+Smagt%2C+P">Patrick van der Smagt</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+J">Jun Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+N">Nutan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03253v1-abstract-short" style="display: inline;"> Multi-modality magnetic resonance imaging data with various sequences facilitate the early diagnosis, tumor segmentation, and disease staging in the management of nasopharyngeal carcinoma (NPC). The lack of publicly available, comprehensive datasets limits advancements in diagnosis, treatment planning, and the development of machine learning algorithms for NPC. Addressing this critical need, we in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03253v1-abstract-full').style.display = 'inline'; document.getElementById('2404.03253v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03253v1-abstract-full" style="display: none;"> Multi-modality magnetic resonance imaging data with various sequences facilitate the early diagnosis, tumor segmentation, and disease staging in the management of nasopharyngeal carcinoma (NPC). 
The lack of publicly available, comprehensive datasets limits advancements in diagnosis, treatment planning, and the development of machine learning algorithms for NPC. Addressing this critical need, we introduce the first comprehensive NPC MRI dataset, encompassing MR axial imaging of 277 primary NPC patients. This dataset includes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences, totaling 831 scans. In addition to the corresponding clinical data, manually annotated and labeled segmentations by experienced radiologists offer high-quality data resources from untreated primary NPC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03253v1-abstract-full').style.display = 'none'; document.getElementById('2404.03253v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15139">arXiv:2403.15139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15139">pdf</a>, <a href="https://arxiv.org/format/2403.15139">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Deep Generative Model based Rate-Distortion for Image Downscaling Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liang%2C+Y">Yuanbang Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Garg%2C+B">Bhavesh Garg</a>, <a href="/search/eess?searchtype=author&amp;query=Rosin%2C+P+L">Paul L Rosin</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yipeng Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15139v1-abstract-short" style="display: inline;"> In this paper, we propose Image Downscaling Assessment by Rate-Distortion (IDA-RD), a novel measure to quantitatively evaluate image downscaling algorithms. In contrast to image-based methods that measure the quality of downscaled images, ours is process-based that draws ideas from rate-distortion theory to measure the distortion incurred during downscaling. 
Our main idea is that downscaling and s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15139v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15139v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15139v1-abstract-full" style="display: none;"> In this paper, we propose Image Downscaling Assessment by Rate-Distortion (IDA-RD), a novel measure to quantitatively evaluate image downscaling algorithms. In contrast to image-based methods that measure the quality of downscaled images, ours is process-based that draws ideas from rate-distortion theory to measure the distortion incurred during downscaling. Our main idea is that downscaling and super-resolution (SR) can be viewed as the encoding and decoding processes in the rate-distortion model, respectively, and that a downscaling algorithm that preserves more details in the resulting low-resolution (LR) images should lead to less distorted high-resolution (HR) images in SR. In other words, the distortion should increase as the downscaling algorithm deteriorates. However, it is non-trivial to measure this distortion as it requires the SR algorithm to be blind and stochastic. Our key insight is that such requirements can be met by recent SR algorithms based on deep generative models that can find all matching HR images for a given LR image on their learned image manifolds. Extensive experimental results show the effectiveness of our IDA-RD measure. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15139v1-abstract-full').style.display = 'none'; document.getElementById('2403.15139v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08247">arXiv:2403.08247</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.08247">pdf</a>, <a href="https://arxiv.org/format/2403.08247">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Dual-domain Regularization Method for Ring Artifact Removal of X-ray CT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+H">Hongyang Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+X">Xin Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yanwei Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+X">Xinran Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+T">Tianjiao Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yunsong Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2403.08247v2-abstract-short" style="display: inline;"> Ring artifacts in computed tomography images, arising from the undesirable responses of detector units, significantly degrade image quality and diagnostic reliability. To address this challenge, we propose a dual-domain regularization model to effectively remove ring artifacts, while maintaining the integrity of the original CT image. The proposed model corrects the vertical stripe artifacts on th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08247v2-abstract-full').style.display = 'inline'; document.getElementById('2403.08247v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08247v2-abstract-full" style="display: none;"> Ring artifacts in computed tomography images, arising from the undesirable responses of detector units, significantly degrade image quality and diagnostic reliability. To address this challenge, we propose a dual-domain regularization model to effectively remove ring artifacts, while maintaining the integrity of the original CT image. The proposed model corrects the vertical stripe artifacts on the sinogram by innovatively updating the response inconsistency compensation coefficients of detector units, which is achieved by employing the group sparse constraint and the projection-view direction sparse constraint on the stripe artifacts. Simultaneously, we apply the sparse constraint on the reconstructed image to further rectify ring artifacts in the image domain. The key advantage of the proposed method lies in considering the relationship between the response inconsistency compensation coefficients of the detector units and the projection views, which enables a more accurate correction of the response of the detector units. An alternating minimization method is designed to solve the model. 
Comparative experiments on real photon counting detector data demonstrate that the proposed method not only surpasses existing methods in removing ring artifacts but also excels in preserving structural details and image fidelity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08247v2-abstract-full').style.display = 'none'; document.getElementById('2403.08247v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.04351">arXiv:2401.04351</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.04351">pdf</a>, <a href="https://arxiv.org/format/2401.04351">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.conengprac.2023.105840">10.1016/j.conengprac.2023.105840 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Change Point Detection Integrated Remaining Useful Life Estimation Model under Variable Operating Conditions </p> <p class="authors"> <span 
class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Arunan%2C+A">Anushiya Arunan</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yan Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaoli Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yuen%2C+C">Chau Yuen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.04351v1-abstract-short" style="display: inline;"> By informing the onset of the degradation process, health status evaluation serves as a significant preliminary step for reliable remaining useful life (RUL) estimation of complex equipment. This paper proposes a novel temporal dynamics learning-based model for detecting change points of individual devices, even under variable operating conditions, and utilises the learnt change points to improve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04351v1-abstract-full').style.display = 'inline'; document.getElementById('2401.04351v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.04351v1-abstract-full" style="display: none;"> By informing the onset of the degradation process, health status evaluation serves as a significant preliminary step for reliable remaining useful life (RUL) estimation of complex equipment. This paper proposes a novel temporal dynamics learning-based model for detecting change points of individual devices, even under variable operating conditions, and utilises the learnt change points to improve the RUL estimation accuracy. During offline model development, the multivariate sensor data are decomposed to learn fused temporal correlation features that are generalisable and representative of normal operation dynamics across multiple operating conditions. 
Monitoring statistics and control limit thresholds for normal behaviour are dynamically constructed from these learnt temporal features for the unsupervised detection of device-level change points. The detected change points then inform the degradation data labelling for training a long short-term memory (LSTM)-based RUL estimation model. During online monitoring, the temporal correlation dynamics of a query device is monitored for breach of the control limit derived in offline training. If a change point is detected, the device&#39;s RUL is estimated with the well-trained offline model for early preventive action. Using C-MAPSS turbofan engines as the case study, the proposed method improved the accuracy by 5.6% and 7.5% for two scenarios with six operating conditions, when compared to existing LSTM-based RUL estimation models that do not consider heterogeneous change points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04351v1-abstract-full').style.display = 'none'; document.getElementById('2401.04351v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in Control Engineering Practice Journal with DOI: https://doi.org/10.1016/j.conengprac.2023.105840</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13567">arXiv:2312.13567</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.13567">pdf</a>, <a href="https://arxiv.org/format/2312.13567">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Fine-grained Disentangled Representation Learning for Multimodal Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.13567v1-abstract-short" style="display: inline;"> Multimodal emotion recognition (MMER) is an active research field that aims to accurately recognize human emotions by fusing multiple perceptual modalities. 
However, inherent heterogeneity across modalities introduces distribution gaps and information redundancy, posing significant challenges for MMER. In this paper, we propose a novel fine-grained disentangled representation learning (FDRL) frame&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13567v1-abstract-full').style.display = 'inline'; document.getElementById('2312.13567v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13567v1-abstract-full" style="display: none;"> Multimodal emotion recognition (MMER) is an active research field that aims to accurately recognize human emotions by fusing multiple perceptual modalities. However, inherent heterogeneity across modalities introduces distribution gaps and information redundancy, posing significant challenges for MMER. In this paper, we propose a novel fine-grained disentangled representation learning (FDRL) framework to address these challenges. Specifically, we design modality-shared and modality-private encoders to project each modality into modality-shared and modality-private subspaces, respectively. In the shared subspace, we introduce a fine-grained alignment component to learn modality-shared representations, thus capturing modal consistency. Subsequently, we tailor a fine-grained disparity component to constrain the private subspaces, thereby learning modality-private representations and enhancing their diversity. Lastly, we introduce a fine-grained predictor component to ensure that the labels of the output representations from the encoders remain unchanged. Experimental results on the IEMOCAP dataset show that FDRL outperforms the state-of-the-art methods, achieving 78.34% and 79.44% on WAR and UAR, respectively. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13567v1-abstract-full').style.display = 'none'; document.getElementById('2312.13567v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13560">arXiv:2312.13560</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.13560">pdf</a>, <a href="https://arxiv.org/format/2312.13560">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> kNN-CTC: Enhancing ASR via Retrieval of CTC Pseudo Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yaqi Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2312.13560v2-abstract-short" style="display: inline;"> The success of retrieval-augmented language models in various natural language processing (NLP) tasks has been constrained in automatic speech recognition (ASR) applications due to challenges in constructing fine-grained audio-text datastores. This paper presents kNN-CTC, a novel approach that overcomes these challenges by leveraging Connectionist Temporal Classification (CTC) pseudo labels to est&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13560v2-abstract-full').style.display = 'inline'; document.getElementById('2312.13560v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13560v2-abstract-full" style="display: none;"> The success of retrieval-augmented language models in various natural language processing (NLP) tasks has been constrained in automatic speech recognition (ASR) applications due to challenges in constructing fine-grained audio-text datastores. This paper presents kNN-CTC, a novel approach that overcomes these challenges by leveraging Connectionist Temporal Classification (CTC) pseudo labels to establish frame-level audio-text key-value pairs, circumventing the need for precise ground truth alignments. We further introduce a skip-blank strategy, which strategically ignores CTC blank frames, to reduce datastore size. kNN-CTC incorporates a k-nearest neighbors retrieval mechanism into pre-trained CTC ASR systems, achieving significant improvements in performance. By incorporating a k-nearest neighbors retrieval mechanism into pre-trained CTC ASR systems and leveraging a fine-grained, pruned datastore, kNN-CTC consistently achieves substantial improvements in performance under various experimental settings. Our code is available at https://github.com/NKU-HLT/KNN-CTC. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13560v2-abstract-full').style.display = 'none'; document.getElementById('2312.13560v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11536">arXiv:2312.11536</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.11536">pdf</a>, <a href="https://arxiv.org/format/2312.11536">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fast Decision Boundary based Out-of-Distribution Detector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Litian Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yao Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11536v2-abstract-short" style="display: inline;"> Efficient and effective Out-of-Distribution (OOD) detection is essential for the safe deployment of AI systems. 
Existing feature space methods, while effective, often incur significant computational overhead due to their reliance on auxiliary models built from training features. In this paper, we propose a computationally-efficient OOD detector without using auxiliary models while still leveraging&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11536v2-abstract-full').style.display = 'inline'; document.getElementById('2312.11536v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.11536v2-abstract-full" style="display: none;"> Efficient and effective Out-of-Distribution (OOD) detection is essential for the safe deployment of AI systems. Existing feature space methods, while effective, often incur significant computational overhead due to their reliance on auxiliary models built from training features. In this paper, we propose a computationally-efficient OOD detector without using auxiliary models while still leveraging the rich information embedded in the feature space. Specifically, we detect OOD samples based on their feature distances to decision boundaries. To minimize computational cost, we introduce an efficient closed-form estimation, analytically proven to tightly lower bound the distance. Based on our estimation, we discover that In-Distribution (ID) features tend to be further from decision boundaries than OOD features. Additionally, ID and OOD samples are better separated when compared at equal deviation levels from the mean of training features. By regularizing the distances to decision boundaries based on feature deviation from the mean, we develop a hyperparameter-free, auxiliary model-free OOD detector. Our method matches or surpasses the effectiveness of state-of-the-art methods in extensive experiments while incurring negligible overhead in inference latency. 
Overall, our approach significantly improves the efficiency-effectiveness trade-off in OOD detection. Code is available at: https://github.com/litianliu/fDBD-OOD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11536v2-abstract-full').style.display = 'none'; document.getElementById('2312.11536v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024 main conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10344">arXiv:2312.10344</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.10344">pdf</a>, <a href="https://arxiv.org/format/2312.10344">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Unveiling Passive and Active EMF Exposure in Large-Scale Cellular Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. 
Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Elzanaty%2C+A">Ahmed Elzanaty</a>, <a href="/search/eess?searchtype=author&amp;query=Chiaraviglio%2C+L">Luca Chiaraviglio</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.10344v1-abstract-short" style="display: inline;"> With the development of fifth-generation (5G) networks, the number of user equipments (UE) increases dramatically. However, the potential health risks from electromagnetic fields (EMF) tend to be a public concern. Generally, EMF exposure-related analysis mainly considers the passive exposure from base stations (BSs) and active exposure that results from the user&#39;s personal devices while communicat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10344v1-abstract-full').style.display = 'inline'; document.getElementById('2312.10344v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10344v1-abstract-full" style="display: none;"> With the development of fifth-generation (5G) networks, the number of user equipments (UE) increases dramatically. However, the potential health risks from electromagnetic fields (EMF) tend to be a public concern. Generally, EMF exposure-related analysis mainly considers the passive exposure from base stations (BSs) and active exposure that results from the user&#39;s personal devices while communicating. However, the passive radiation that is generated by nearby devices of other users is typically ignored. In fact, with the increase in the density of UE, their passive exposure to human bodies can no longer be ignored. 
In this work, we propose a stochastic geometry framework to analyze the EMF exposure from active and passive radiation sources. In particular, considering a typical user, we account for their exposure to EMF from BSs, their own UE, and other UE. We derive the distribution of the Exposure index (EI) and the coverage probability for two typical models for spatial distributions of UE, i.e., \textit{i)} a Poisson point process (PPP); \textit{ii)} a Matern cluster process. Also, we show the trade-off between the EMF exposure and the coverage probability. Our numerical results suggest that the passive exposure from other users is non-negligible compared to the exposure from BSs when user density is $10^2$ times higher than BS density, and non-negligible compared to active exposure from the user&#39;s own UE when user density is $10^5$ times the BS density. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10344v1-abstract-full').style.display = 'none'; document.getElementById('2312.10344v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.00535">arXiv:2312.00535</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.00535">pdf</a>, <a href="https://arxiv.org/format/2312.00535">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RIS-Based On-the-Air Semantic Communications -- a Diffractional Deep Neural Network Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Shuyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Hui%2C+Y">Yingzhe Hui</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yifan Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Y">Yueyi Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+W">Weixiao Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+X">Xuewen Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hsiao-Hwa Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.00535v1-abstract-short" style="display: inline;"> Semantic communication has gained significant attention recently due to its advantages in achieving higher transmission efficiency by focusing on semantic information instead of bit-level information. However, current AI-based semantic communication methods require digital hardware for implementation. 
With the rapid advancement on reconfigurable intelligence surfaces (RISs), a new approach called&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.00535v1-abstract-full').style.display = 'inline'; document.getElementById('2312.00535v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.00535v1-abstract-full" style="display: none;"> Semantic communication has gained significant attention recently due to its advantages in achieving higher transmission efficiency by focusing on semantic information instead of bit-level information. However, current AI-based semantic communication methods require digital hardware for implementation. With the rapid advancement on reconfigurable intelligence surfaces (RISs), a new approach called on-the-air diffractional deep neural networks (D$^2$NN) can be utilized to enable semantic communications on the wave domain. This paper proposes a new paradigm of RIS-based on-the-air semantic communications, where the computational process occurs inherently as wireless signals pass through RISs. We present the system model and discuss the data and control flows of this scheme, followed by a performance analysis using image transmission as an example. In comparison to traditional hardware-based approaches, RIS-based semantic communications offer appealing features, such as light-speed computation, low computational power requirements, and the ability to handle multiple tasks simultaneously. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.00535v1-abstract-full').style.display = 'none'; document.getElementById('2312.00535v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 5 figures, accepted by IEEE WCM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08715">arXiv:2311.08715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.08715">pdf</a>, <a href="https://arxiv.org/format/2311.08715">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TVT.2023.3323682">10.1109/TVT.2023.3323682 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Stochastic Geometry-based Trajectory Design for Multi-Purpose UAVs: Package and Data Delivery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. 
Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08715v1-abstract-short" style="display: inline;"> With the advancements achieved in drones&#39; flexibility, low cost, and high efficiency, they obtain huge application opportunities in various industries, such as aerial delivery and future communication networks. However, the increasing transportation needs and expansion of network capacity demands for UAVs will cause aerial traffic conflicts in the future. To address this issue, in this paper, we e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08715v1-abstract-full').style.display = 'inline'; document.getElementById('2311.08715v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08715v1-abstract-full" style="display: none;"> With the advancements achieved in drones&#39; flexibility, low cost, and high efficiency, they obtain huge application opportunities in various industries, such as aerial delivery and future communication networks. However, the increasing transportation needs and expansion of network capacity demands for UAVs will cause aerial traffic conflicts in the future. To address this issue, in this paper, we explore the idea of multi-purpose UAVs, which act as aerial wireless communication data relays and means of aerial transportation simultaneously to deliver data and packages at the same time. While UAVs deliver the packages from warehouses to residential areas, we design their trajectories which enable them to collect data from multiple Internet of Things (IoT) clusters and forward the collected data to terrestrial base stations (TBSs). 
To select the serving nearby IoT clusters, UAVs rank them based on their priorities and distances. From the perspectives of data and package delivery, respectively, we propose two algorithms that design the optimal UAVs trajectory to maximize the transmitted data or minimize the round trip time. Specifically, we use tools from stochastic geometry to model the locations of IoT clusters and TBSs. Given the nature of random locations, the proposed algorithm applies to general cases. Our numerical results show that multi-purpose UAVs are practical and have great potential to enhance the energy/time-efficiency of future networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08715v1-abstract-full').style.display = 'none'; document.getElementById('2311.08715v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 10.1109/TVT.2023.3323682 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.01479">arXiv:2311.01479</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.01479">pdf</a>, <a href="https://arxiv.org/format/2311.01479">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Detecting Out-of-Distribution Through the Lens of Neural Collapse </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Litian Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yao Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.01479v6-abstract-short" style="display: inline;"> Out-of-distribution (OOD) detection is essential for safe deployment; however, existing detectors exhibit generalization discrepancies and cost concerns. To address this, we propose a highly versatile and efficient OOD detector inspired by the trend of Neural Collapse on practical models, without requiring complete collapse. 
By analyzing this trend, we discover that features of in-distribution (ID&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01479v6-abstract-full').style.display = 'inline'; document.getElementById('2311.01479v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.01479v6-abstract-full" style="display: none;"> Out-of-distribution (OOD) detection is essential for safe deployment; however, existing detectors exhibit generalization discrepancies and cost concerns. To address this, we propose a highly versatile and efficient OOD detector inspired by the trend of Neural Collapse on practical models, without requiring complete collapse. By analyzing this trend, we discover that features of in-distribution (ID) samples cluster closer to the weight vectors compared to features of OOD samples. Additionally, we reveal that ID features tend to expand in space to structure a simplex Equiangular Tight Framework, which explains the prevalent observation that ID features reside further from the origin than OOD features. Taking both insights from Neural Collapse into consideration, our OOD detector utilizes feature proximity to weight vectors and further complements this perspective by using feature norms to filter OOD samples. Extensive experiments on off-the-shelf models demonstrate the efficiency and effectiveness of our OOD detector across diverse classification tasks and model architectures, mitigating generalization discrepancies and improving overall performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01479v6-abstract-full').style.display = 'none'; document.getElementById('2311.01479v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.01047">arXiv:2311.01047</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.01047">pdf</a>, <a href="https://arxiv.org/format/2311.01047">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Improving Robustness via Tilted Exponential Layer: A Communication-Theoretic Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Puranik%2C+B">Bhagyashree Puranik</a>, <a href="/search/eess?searchtype=author&amp;query=Beirami%2C+A">Ahmad Beirami</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yao Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Madhow%2C+U">Upamanyu Madhow</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.01047v3-abstract-short" style="display: inline;"> State-of-the-art techniques for 
enhancing robustness of deep networks mostly rely on empirical risk minimization with suitable data augmentation. In this paper, we propose a complementary approach motivated by communication theory, aimed at enhancing the signal-to-noise ratio at the output of a neural network layer via neural competition during learning and inference. In addition to standard empir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01047v3-abstract-full').style.display = 'inline'; document.getElementById('2311.01047v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.01047v3-abstract-full" style="display: none;"> State-of-the-art techniques for enhancing robustness of deep networks mostly rely on empirical risk minimization with suitable data augmentation. In this paper, we propose a complementary approach motivated by communication theory, aimed at enhancing the signal-to-noise ratio at the output of a neural network layer via neural competition during learning and inference. In addition to standard empirical risk minimization, neurons compete to sparsely represent layer inputs by maximization of a tilted exponential (TEXP) objective function for the layer. TEXP learning can be interpreted as maximum likelihood estimation of matched filters under a Gaussian model for data noise. Inference in a TEXP layer is accomplished by replacing batch norm by a tilted softmax, which can be interpreted as computation of posterior probabilities for the competing signaling hypotheses represented by each neuron. After providing insights via simplified models, we show, by experimentation on standard image datasets, that TEXP learning and inference enhances robustness against noise and other common corruptions, without requiring data augmentation. 
Further cumulative gains in robustness against this array of distortions can be obtained by appropriately combining TEXP with data augmentation techniques. The code for all our experiments is available at https://github.com/bhagyapuranik/texp_for_robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01047v3-abstract-full').style.display = 'none'; document.getElementById('2311.01047v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27th International Conference on Artificial Intelligence and Statistics (AISTATS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.06366">arXiv:2310.06366</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.06366">pdf</a>, <a href="https://arxiv.org/format/2310.06366">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> On the Peak AoI of UAV-assisted IoT Networks: A Stochastic Geometry Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. 
Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.06366v1-abstract-short" style="display: inline;"> In this paper, we analyze the peak age of information (PAoI) in UAV-assisted internet of thing (IoT) networks, in which the locations of IoT devices are modeled by a Matérn cluster process (MCP) and UAVs are deployed at the cluster centers to collect the status updates from the devices. Specifically, we consider that IoT devices can either monitor the same physical process or different physical pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.06366v1-abstract-full').style.display = 'inline'; document.getElementById('2310.06366v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.06366v1-abstract-full" style="display: none;"> In this paper, we analyze the peak age of information (PAoI) in UAV-assisted internet of thing (IoT) networks, in which the locations of IoT devices are modeled by a Matérn cluster process (MCP) and UAVs are deployed at the cluster centers to collect the status updates from the devices. Specifically, we consider that IoT devices can either monitor the same physical process or different physical processes and UAVs split their resources, time or bandwidth, to serve the devices to avoid inter-cluster interference. Using tools from stochastic geometry, we are able to compute the mean activity probability of IoT devices and the conditional success probability of an individual device. We then use tools from queuing theory to compute the PAoI under two load models and two scenarios for devices, respectively. Our numerical results show interesting system insights. 
We first show that for a low data arrival rate, increasing the number of correlated devices can improve the PAoI for both load models. Next, we show that even though the time-splitting technique causes higher interference, it has a limited impact on the mean PAoI, and the mean PAoI benefits more from the time-splitting technique. This is because of the nature of UAV communication, especially at places where devices (users) are spatially-clustered: shorter transmission distances and better communication channels, comparing the links established by the cluster UAV and serving devices (users) to links established by interferers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.06366v1-abstract-full').style.display = 'none'; document.getElementById('2310.06366v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.11756">arXiv:2309.11756</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.11756">pdf</a>, <a href="https://arxiv.org/format/2309.11756">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Sparsely Shared LoRA on Whisper for Child Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Ying Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+Z">Zhiyuan Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+T">Tan Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.11756v2-abstract-short" style="display: inline;"> Whisper is a powerful automatic speech recognition (ASR) model. Nevertheless, its zero-shot performance on low-resource speech requires further improvement. Child speech, as a representative type of low-resource speech, is leveraged for adaptation. 
Recently, parameter-efficient fine-tuning (PEFT) in NLP was shown to be comparable and even better than full fine-tuning, while only needing to tune a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11756v2-abstract-full').style.display = 'inline'; document.getElementById('2309.11756v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11756v2-abstract-full" style="display: none;"> Whisper is a powerful automatic speech recognition (ASR) model. Nevertheless, its zero-shot performance on low-resource speech requires further improvement. Child speech, as a representative type of low-resource speech, is leveraged for adaptation. Recently, parameter-efficient fine-tuning (PEFT) in NLP was shown to be comparable and even better than full fine-tuning, while only needing to tune a small set of trainable parameters. However, current PEFT methods have not been well examined for their effectiveness on Whisper. In this paper, only parameter composition types of PEFT approaches such as LoRA and Bitfit are investigated as they do not bring extra inference costs. Different popular PEFT methods are examined. Particularly, we compare LoRA and AdaLoRA and figure out the learnable rank coefficient is a good design. Inspired by the sparse rank distribution allocated by AdaLoRA, a novel PEFT approach Sparsely Shared LoRA (S2-LoRA) is proposed. The two low-rank decomposed matrices are globally shared. Each weight matrix only has to maintain its specific rank coefficients that are constrained to be sparse. Experiments on low-resource Chinese child speech show that with much fewer trainable parameters, S2-LoRA can achieve comparable in-domain adaptation performance to AdaLoRA and exhibit better generalization ability on out-of-domain data. 
In addition, the rank distribution automatically learned by S2-LoRA is found to have similar patterns to AdaLoRA&#39;s allocation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11756v2-abstract-full').style.display = 'none'; document.getElementById('2309.11756v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.04950">arXiv:2309.04950</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.04950">pdf</a>, <a href="https://arxiv.org/format/2309.04950">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Dominant Interferer-based Approximation for Uplink SINR Meta Distribution in Cellular Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. 
Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.04950v1-abstract-short" style="display: inline;"> This work studies the signal-to-interference-plus-noise-ratio (SINR) meta distribution for the uplink transmission of a Poisson network with Rayleigh fading by using the dominant interferer-based approximation. The proposed approach relies on computing the mix of exact and mean-field analysis of interference. In particular, it requires the distance distribution of the nearest interferer and the co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.04950v1-abstract-full').style.display = 'inline'; document.getElementById('2309.04950v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.04950v1-abstract-full" style="display: none;"> This work studies the signal-to-interference-plus-noise-ratio (SINR) meta distribution for the uplink transmission of a Poisson network with Rayleigh fading by using the dominant interferer-based approximation. The proposed approach relies on computing the mix of exact and mean-field analysis of interference. In particular, it requires the distance distribution of the nearest interferer and the conditional average of the rest of the interference. Using the widely studied fractional path-loss inversion power control and modeling the spatial locations of base stations (BSs) by a Poisson point process (PPP), we obtain the meta distribution based on the proposed method and compare it with the traditional beta approximation, as well as the exact results obtained via Monte-Carlo simulations. Our numerical results validate that the proposed method shows good matching and is time competitive. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.04950v1-abstract-full').style.display = 'none'; document.getElementById('2309.04950v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2302.03574</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.16488">arXiv:2308.16488</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.16488">pdf</a>, <a href="https://arxiv.org/format/2308.16488">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2023-851">10.21437/Interspeech.2023-851 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> RAMP: Retrieval-Augmented MOS Prediction via Confidence-based Dynamic Weighting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+X">Xiguang 
Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.16488v1-abstract-short" style="display: inline;"> Automatic Mean Opinion Score (MOS) prediction is crucial to evaluate the perceptual quality of the synthetic speech. While recent approaches using pre-trained self-supervised learning (SSL) models have shown promising results, they only partly address the data scarcity issue for the feature extractor. This leaves the data scarcity issue for the decoder unresolved, leading to suboptimal performa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16488v1-abstract-full').style.display = 'inline'; document.getElementById('2308.16488v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.16488v1-abstract-full" style="display: none;"> Automatic Mean Opinion Score (MOS) prediction is crucial to evaluate the perceptual quality of the synthetic speech. While recent approaches using pre-trained self-supervised learning (SSL) models have shown promising results, they only partly address the data scarcity issue for the feature extractor. This leaves the data scarcity issue for the decoder unresolved, leading to suboptimal performance. To address this challenge, we propose a retrieval-augmented MOS prediction method, dubbed RAMP, to enhance the decoder&#39;s ability against the data scarcity issue. A fusing network is also proposed to dynamically adjust the retrieval scope for each instance and the fusion weights based on the predictive confidence. Experimental results show that our proposed method outperforms the existing methods in multiple scenarios. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16488v1-abstract-full').style.display = 'none'; document.getElementById('2308.16488v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2023, oral</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> INTERSPEECH 2023, 1095-1099 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.16485">arXiv:2308.16485</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.16485">pdf</a>, <a href="https://arxiv.org/format/2308.16485">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2023-842">10.21437/Interspeech.2023-842 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Supervised Contrastive Learning with Nearest Neighbor Search for Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a 
href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.16485v1-abstract-short" style="display: inline;"> Speech Emotion Recognition (SER) is a challenging task due to limited data and blurred boundaries of certain emotions. In this paper, we present a comprehensive approach to improve the SER performance throughout the model lifecycle, including pre-training, fine-tuning, and inference stages. To address the data scarcity issue, we utilize a pre-trained model, wav2vec2.0. During fine-tuning, we propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16485v1-abstract-full').style.display = 'inline'; document.getElementById('2308.16485v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.16485v1-abstract-full" style="display: none;"> Speech Emotion Recognition (SER) is a challenging task due to limited data and blurred boundaries of certain emotions. In this paper, we present a comprehensive approach to improve the SER performance throughout the model lifecycle, including pre-training, fine-tuning, and inference stages. To address the data scarcity issue, we utilize a pre-trained model, wav2vec2.0. During fine-tuning, we propose a novel loss function that combines cross-entropy loss with supervised contrastive learning loss to improve the model&#39;s discriminative ability. This approach increases the inter-class distances and decreases the intra-class distances, mitigating the issue of blurred boundaries. 
Finally, to leverage the improved distances, we propose an interpolation method at the inference stage that combines the model prediction with the output from a k-nearest neighbors model. Our experiments on IEMOCAP demonstrate that our proposed methods outperform current state-of-the-art results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16485v1-abstract-full').style.display = 'none'; document.getElementById('2308.16485v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2023, poster</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> INTERSPEECH 2023, 1913-1917 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.09412">arXiv:2308.09412</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.09412">pdf</a>, <a href="https://arxiv.org/format/2308.09412">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Causal SAR ATR with Limited Data via Dual Invariance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenwei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">You Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Li Li</a>, <a 
href="/search/eess?searchtype=author&amp;query=Luo%2C+S">Siyi Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yulin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Pei%2C+J">Jifang Pei</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+J">Jianyu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.09412v2-abstract-short" style="display: inline;"> Synthetic aperture radar automatic target recognition (SAR ATR) with limited data has recently been a hot research topic to enhance weak generalization. Despite many excellent methods being proposed, a fundamental theory is lacking to explain what problem the limited SAR data causes, leading to weak generalization of ATR. In this paper, we establish a causal ATR model demonstrating that noise $N$ t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09412v2-abstract-full').style.display = 'inline'; document.getElementById('2308.09412v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.09412v2-abstract-full" style="display: none;"> Synthetic aperture radar automatic target recognition (SAR ATR) with limited data has recently been a hot research topic to enhance weak generalization. Despite many excellent methods being proposed, a fundamental theory is lacking to explain what problem the limited SAR data causes, leading to weak generalization of ATR. In this paper, we establish a causal ATR model demonstrating that noise $N$ that could be blocked with ample SAR data, becomes a confounder with limited data for recognition. 
As a result, it has a detrimental causal effect damaging the efficacy of feature $X$ extracted from SAR images, leading to weak generalization of SAR ATR with limited data. The effect of $N$ on feature can be estimated and eliminated by using backdoor adjustment to pursue the direct causality between $X$ and the predicted class $Y$. However, it is difficult for SAR images to precisely estimate and eliminate the effect of $N$ on $X$. The limited SAR data scarcely powers the majority of existing optimization losses based on empirical risk minimization (ERM), thus making it difficult to effectively eliminate $N$&#39;s effect. To tackle the difficult estimation and elimination of $N$&#39;s effect, we propose a dual invariance comprising the inner-class invariant proxy and the noise-invariance loss. Motivated by tackling change with invariance, the inner-class invariant proxy facilitates precise estimation of $N$&#39;s effect on $X$ by obtaining accurate invariant features for each class with the limited data. The noise-invariance loss transitions the ERM&#39;s data quantity necessity into a need for noise environment annotations, effectively eliminating $N$&#39;s effect on $X$ by cleverly applying the previous $N$&#39;s estimation as the noise environment annotations. Experiments on three benchmark datasets indicate that the proposed method achieves superior performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09412v2-abstract-full').style.display = 'none'; document.getElementById('2308.09412v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.09396">arXiv:2308.09396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.09396">pdf</a>, <a href="https://arxiv.org/format/2308.09396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Unveiling Causalities in SAR ATR: A Causal Interventional Approach for Limited Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenwei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">You Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+S">Siyi Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yulin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Pei%2C+J">Jifang Pei</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+J">Jianyu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.09396v1-abstract-short" style="display: inline;"> Synthetic aperture radar automatic target recognition (SAR ATR) methods fall short with limited training data. In this letter, we propose a causal interventional ATR method (CIATR) to formulate the problem of limited SAR data which helps us uncover the ever-elusive causalities among the key factors in ATR, and thus pursue the desired causal effect without changing the imaging conditions. 
A structu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09396v1-abstract-full').style.display = 'inline'; document.getElementById('2308.09396v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.09396v1-abstract-full" style="display: none;"> Synthetic aperture radar automatic target recognition (SAR ATR) methods fall short with limited training data. In this letter, we propose a causal interventional ATR method (CIATR) to formulate the problem of limited SAR data which helps us uncover the ever-elusive causalities among the key factors in ATR, and thus pursue the desired causal effect without changing the imaging conditions. A structural causal model (SCM) is comprised using causal inference to help understand how imaging conditions act as a confounder introducing spurious correlation when SAR data is limited. This spurious correlation among SAR images and the predicted classes can be fundamentally tackled with the conventional backdoor adjustments. An effective implementation of backdoor adjustments is proposed by firstly using data augmentation with spatial-frequency domain hybrid transformation to estimate the potential effect of varying imaging conditions on SAR images. Then, a feature discrimination approach with hybrid similarity measurement is introduced to measure and mitigate the structural and vector angle impacts of varying imaging conditions on the extracted features from SAR images. Thus, our CIATR can pursue the true causality between SAR images and the corresponding classes even with limited SAR data. Experiments and comparisons conducted on the moving and stationary target acquisition and recognition (MSTAR) and OpenSARship datasets have shown the effectiveness of our method with limited SAR data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09396v1-abstract-full').style.display = 'none'; document.getElementById('2308.09396v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.07302">arXiv:2308.07302</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.07302">pdf</a>, <a href="https://arxiv.org/format/2308.07302">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Vibrational Stabilization of Cluster Synchronization in Oscillator Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yuzhen Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Nobili%2C+A+M">Alberto Maria Nobili</a>, <a href="/search/eess?searchtype=author&amp;query=Bassett%2C+D+S">Danielle S. Bassett</a>, <a href="/search/eess?searchtype=author&amp;query=Pasqualetti%2C+F">Fabio Pasqualetti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.07302v1-abstract-short" style="display: inline;"> Cluster synchronization is of paramount importance for the normal functioning of numerous technological and natural systems. 
Deviations from normal cluster synchronization patterns are closely associated with various malfunctions, such as neurological disorders in the brain. Therefore, it is crucial to restore normal system functions by stabilizing the appropriate cluster synchronization patterns.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.07302v1-abstract-full').style.display = 'inline'; document.getElementById('2308.07302v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.07302v1-abstract-full" style="display: none;"> Cluster synchronization is of paramount importance for the normal functioning of numerous technological and natural systems. Deviations from normal cluster synchronization patterns are closely associated with various malfunctions, such as neurological disorders in the brain. Therefore, it is crucial to restore normal system functions by stabilizing the appropriate cluster synchronization patterns. Most existing studies focus on designing controllers based on state measurements to achieve system stabilization. However, in many real-world scenarios, measuring system states, such as neuronal activity in the brain, poses significant challenges, rendering the stabilization of such systems difficult. To overcome this challenge, in this paper, we employ an open-loop control strategy, vibrational control, which does not require any state measurements. We establish some sufficient conditions under which vibrational inputs stabilize cluster synchronization. Further, we provide a tractable approach to design vibrational control. Finally, numerical experiments are conducted to demonstrate our theoretical findings. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.07302v1-abstract-full').style.display = 'none'; document.getElementById('2308.07302v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Open Journal of Control Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.05823">arXiv:2308.05823</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.05823">pdf</a>, <a href="https://arxiv.org/format/2308.05823">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.23919/ACC55779.2023.10156032">10.23919/ACC55779.2023.10156032 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Vibrational Stabilization of Complex Network Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Nobili%2C+A+M">Alberto Maria Nobili</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yuzhen Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Avizzano%2C+C+A">Carlo Alberto 
Avizzano</a>, <a href="/search/eess?searchtype=author&amp;query=Bassett%2C+D+S">Danielle S. Bassett</a>, <a href="/search/eess?searchtype=author&amp;query=Pasqualetti%2C+F">Fabio Pasqualetti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.05823v1-abstract-short" style="display: inline;"> Many natural and man-made network systems need to maintain certain patterns, such as working at equilibria or limit cycles, to function properly. Thus, the ability to stabilize such patterns is crucial. Most of the existing studies on stabilization assume that network systems states can be measured online so that feedback control strategies can be used. However, in many real-world scenarios, syste&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05823v1-abstract-full').style.display = 'inline'; document.getElementById('2308.05823v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.05823v1-abstract-full" style="display: none;"> Many natural and man-made network systems need to maintain certain patterns, such as working at equilibria or limit cycles, to function properly. Thus, the ability to stabilize such patterns is crucial. Most of the existing studies on stabilization assume that network systems states can be measured online so that feedback control strategies can be used. However, in many real-world scenarios, systems states, e.g., neuronal activity in the brain, are often difficult to measure. In this paper, we take this situation into account and study the stabilization problem of linear network systems with an open-loop control strategy (vibrational control). We derive a graph-theoretic sufficient condition for structural vibrational stabilizability, under which network systems can always be stabilized. 
We further provide an approach to select the locations in the network for control placement and design corresponding vibrational inputs to stabilize systems that satisfy this condition. Finally, we provide some numerical results that demonstrate the validity of our theoretical findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05823v1-abstract-full').style.display = 'none'; document.getElementById('2308.05823v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 2022 American Control Conference, San Diego, May, 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.05084">arXiv:2304.05084</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.05084">pdf</a>, <a href="https://arxiv.org/format/2304.05084">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Self-attention Knowledge Domain Adaptation Network for Commercial Lithium-ion Batteries State-of-health Estimation under Shallow Cycles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yuwen Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+W">Weidong Zhao</a>, <a 
href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qiming Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+N">Ningbo Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+K">Kai Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.05084v1-abstract-short" style="display: inline;"> Accurate state-of-health (SOH) estimation is critical to guarantee the safety, efficiency and reliability of battery-powered applications. Most SOH estimation methods focus on the 0-100\% full state-of-charge (SOC) range that has similar distributions. However, the batteries in real-world applications usually work in the partial SOC range under shallow-cycle conditions and follow different degrada&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05084v1-abstract-full').style.display = 'inline'; document.getElementById('2304.05084v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.05084v1-abstract-full" style="display: none;"> Accurate state-of-health (SOH) estimation is critical to guarantee the safety, efficiency and reliability of battery-powered applications. Most SOH estimation methods focus on the 0-100\% full state-of-charge (SOC) range that has similar distributions. However, the batteries in real-world applications usually work in the partial SOC range under shallow-cycle conditions and follow different degradation profiles with no labeled data available, thus making SOH estimation challenging. To estimate shallow-cycle battery SOH, a novel unsupervised deep transfer learning method is proposed to bridge different domains using self-attention distillation module and multi-kernel maximum mean discrepancy technique. 
The proposed method automatically extracts domain-variant features from charge curves to transfer knowledge from the large-scale labeled full cycles to the unlabeled shallow cycles. The CALCE and SNL battery datasets are employed to verify the effectiveness of the proposed method to estimate the battery SOH for different SOC ranges, temperatures, and discharge rates. The proposed method achieves a root-mean-square error within 2\% and outperforms other transfer learning methods for different SOC ranges. When applied to batteries with different operating conditions and from different manufacturers, the proposed method still exhibits superior SOH estimation performance. The proposed method is the first attempt at accurately estimating battery SOH under shallow-cycle conditions without needing a full-cycle characteristic test. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05084v1-abstract-full').style.display = 'none'; document.getElementById('2304.05084v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.03026">arXiv:2304.03026</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.03026">pdf</a>, <a href="https://arxiv.org/format/2304.03026">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Coverage Analysis and Trajectory Optimization for Aerial Users with Dedicated Cellular Infrastructure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.03026v1-abstract-short" style="display: inline;"> In this paper, we consider a novel cellular network for aerial users, which is composed of dedicated base stations (BSs), whose antennas are directed towards aerial users, and traditional terrestrial BSs (TBSs). Besides, the dedicated BSs are deployed on roadside furniture, such as lampposts and traffic lights, to achieve multiple features while occupying less space. 
Therefore, the locations of de&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03026v1-abstract-full').style.display = 'inline'; document.getElementById('2304.03026v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.03026v1-abstract-full" style="display: none;"> In this paper, we consider a novel cellular network for aerial users, which is composed of dedicated base stations (BSs), whose antennas are directed towards aerial users, and traditional terrestrial BSs (TBSs). Besides, the dedicated BSs are deployed on roadside furniture, such as lampposts and traffic lights, to achieve multiple features while occupying less space. Therefore, the locations of dedicated BSs and TBSs are modeled by a Poisson-line-Cox-process (PLCP) and Poisson point process (PPP), respectively. For the proposed network, we first compute the aerial coverage probability and show that the deployment of dedicated BSs improves the coverage probability in both high dense areas and rural areas. We then consider a cellular-connected UAV that has a flying mission and optimize its trajectory to maximize the minimal achievable signal-to-interference-plus-noise ratio (SINR) (Max-Min SINR). To obtain the Max-Min SINR and minimal time trajectory that satisfies the Max-Min SINR, we proposed two algorithms that are practical in large-scale networks. Finally, our results show that the optimal density of dedicated BSs which maximizes Max-Min SINR decreases with the increase of the road densities. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03026v1-abstract-full').style.display = 'none'; document.getElementById('2304.03026v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.05745">arXiv:2303.05745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.05745">pdf</a>, <a href="https://arxiv.org/format/2303.05745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-site, Multi-domain Airway Tree Modeling (ATM&#39;22): A Public Benchmark for Pulmonary Airway Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Minghui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Y">Yangqian Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hanxiao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yulei Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+H">Hao Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+W">Wen Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Arnold%2C+C">Corey Arnold</a>, <a href="/search/eess?searchtype=author&amp;query=Pei%2C+C">Chenhao Pei</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+P">Pengxin Yu</a>, <a 
href="/search/eess?searchtype=author&amp;query=Nan%2C+Y">Yang Nan</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Guang Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Walsh%2C+S">Simon Walsh</a>, <a href="/search/eess?searchtype=author&amp;query=Marshall%2C+D+C">Dominic C. Marshall</a>, <a href="/search/eess?searchtype=author&amp;query=Komorowski%2C+M">Matthieu Komorowski</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+P">Puyang Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+D">Dazhou Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+D">Dakai Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Y">Ya&#39;nan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shuiqing Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Chang%2C+R">Runsheng Chang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+B">Boyu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Lv%2C+X">Xing Lv</a>, <a href="/search/eess?searchtype=author&amp;query=Qayyum%2C+A">Abdul Qayyum</a>, <a href="/search/eess?searchtype=author&amp;query=Mazher%2C+M">Moona Mazher</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+Q">Qi Su</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.05745v3-abstract-short" style="display: inline;"> Open international challenges are becoming the de facto standard for assessing computer vision and image analysis algorithms. In recent years, new methods have extended the reach of pulmonary airway segmentation that is closer to the limit of image resolution. 
Since EXACT&#39;09 pulmonary airway segmentation, limited effort has been directed to quantitative comparison of newly emerged algorithms drive&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.05745v3-abstract-full').style.display = 'inline'; document.getElementById('2303.05745v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.05745v3-abstract-full" style="display: none;"> Open international challenges are becoming the de facto standard for assessing computer vision and image analysis algorithms. In recent years, new methods have extended the reach of pulmonary airway segmentation that is closer to the limit of image resolution. Since EXACT&#39;09 pulmonary airway segmentation, limited effort has been directed to quantitative comparison of newly emerged algorithms driven by the maturity of deep learning based approaches and clinical drive for resolving finer details of distal airways for early intervention of pulmonary diseases. Thus far, public annotated datasets are extremely limited, hindering the development of data-driven methods and detailed performance evaluation of new algorithms. To provide a benchmark for the medical imaging community, we organized the Multi-site, Multi-domain Airway Tree Modeling (ATM&#39;22), which was held as an official challenge event during the MICCAI 2022 conference. ATM&#39;22 provides large-scale CT scans with detailed pulmonary airway annotation, including 500 CT scans (300 for training, 50 for validation, and 150 for testing). The dataset was collected from different sites and it further included a portion of noisy COVID-19 CTs with ground-glass opacity and consolidation. Twenty-three teams participated in the entire phase of the challenge and the algorithms for the top ten teams are reviewed in this paper. 
Quantitative and qualitative results revealed that deep learning models embedded with the topological continuity enhancement achieved superior performance in general. ATM&#39;22 challenge holds as an open-call design, the training data and the gold standard evaluation are available upon successful registration via its homepage. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.05745v3-abstract-full').style.display = 'none'; document.getElementById('2303.05745v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, 16 figures. Homepage: https://atm22.grand-challenge.org/. 
Submitted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.11224">arXiv:2302.11224</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.11224">pdf</a>, <a href="https://arxiv.org/format/2302.11224">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MADI: Inter-domain Matching and Intra-domain Discrimination for Cross-domain Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+N">Ning Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+G">Guoqing Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.11224v1-abstract-short" style="display: inline;"> End-to-end automatic speech recognition (ASR) usually suffers from performance degradation when applied to a new domain due to domain shift. Unsupervised domain adaptation (UDA) aims to improve the performance on the unlabeled target domain by transferring knowledge from the source to the target domain. 
To improve transferability, existing UDA approaches mainly focus on matching the distributions&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11224v1-abstract-full').style.display = 'inline'; document.getElementById('2302.11224v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.11224v1-abstract-full" style="display: none;"> End-to-end automatic speech recognition (ASR) usually suffers from performance degradation when applied to a new domain due to domain shift. Unsupervised domain adaptation (UDA) aims to improve the performance on the unlabeled target domain by transferring knowledge from the source to the target domain. To improve transferability, existing UDA approaches mainly focus on matching the distributions of the source and target domains globally and/or locally, while ignoring the model discriminability. In this paper, we propose a novel UDA approach for ASR via inter-domain MAtching and intra-domain DIscrimination (MADI), which improves the model transferability by fine-grained inter-domain matching and discriminability by intra-domain contrastive discrimination simultaneously. Evaluations on the Libri-Adapt dataset demonstrate the effectiveness of our approach. MADI reduces the relative word error rate (WER) on cross-device and cross-environment ASR by 17.7% and 22.8%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11224v1-abstract-full').style.display = 'none'; document.getElementById('2302.11224v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.03574">arXiv:2302.03574</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.03574">pdf</a>, <a href="https://arxiv.org/format/2302.03574">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Dominant Interferer plus Mean Field-based Approximation for SINR Meta Distribution in Wireless Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.03574v1-abstract-short" style="display: inline;"> This paper proposes a novel approach for computing the meta distribution of the signal-to-interference-plus-noise ratio (SINR) for the downlink transmission in a wireless network with Rayleigh fading. 
The novel approach relies on an approximation mix of exact and mean-field analysis of interference (dominant interferer-based approximation) to reduce the complexity of analysis and enhance tractabil&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.03574v1-abstract-full').style.display = 'inline'; document.getElementById('2302.03574v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.03574v1-abstract-full" style="display: none;"> This paper proposes a novel approach for computing the meta distribution of the signal-to-interference-plus-noise ratio (SINR) for the downlink transmission in a wireless network with Rayleigh fading. The novel approach relies on an approximation mix of exact and mean-field analysis of interference (dominant interferer-based approximation) to reduce the complexity of analysis and enhance tractability. In particular, the proposed approximation omits the need to compute the first or the second moment of the SINR that is used in the beta approximation typically adopted in the literature but requires of computing the joint distance distributions. We first derive the proposed approximation based on a Poisson point process (PPP) network with a standard path-loss and Rayleigh fading and then illustrate its accuracy and operability in another four widely used point processes: Poisson bipolar network, Matérn cluster process (MCP), $K$-tier PPP and Poisson line Cox process (PLCP). Specifically, we obtain the SINR meta distribution for PLCP networks for the first time. Even though the proposed approximation looks simple but it shows good matching in comparison to the popular beta approximation as well as the Monte-Carlo simulations, which opens the door to adopting this approximation in more advanced network architectures. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.03574v1-abstract-full').style.display = 'none'; document.getElementById('2302.03574v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.11200">arXiv:2301.11200</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.11200">pdf</a>, <a href="https://arxiv.org/format/2301.11200">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> On the Downlink SINR Meta Distribution of UAV-assisted Wireless Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yujie Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Kishk%2C+M+A">Mustafa A. Kishk</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.11200v1-abstract-short" style="display: inline;"> The meta distribution of the signal-to-interference-plus-noise ratio (SINR) provides fine-grained information about each link&#39;s performance in a wireless system and the reliability of the whole network. 
While the UAV-enabled network has been studied extensively, most of the works focus on the spatial average performance, such as coverage probability, while SINR meta distribution has received less&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.11200v1-abstract-full').style.display = 'inline'; document.getElementById('2301.11200v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.11200v1-abstract-full" style="display: none;"> The meta distribution of the signal-to-interference-plus-noise ratio (SINR) provides fine-grained information about each link&#39;s performance in a wireless system and the reliability of the whole network. While the UAV-enabled network has been studied extensively, most of the works focus on the spatial average performance, such as coverage probability, while SINR meta distribution has received less attention. In this paper, we use the SINR meta distribution for the first time to systematically analyze the improvement and the influence of deploying UAVs on the reliability of a wireless network. We first derive the $b$-th moments of the conditional success probability of the UAV-enabled network and give the approximated expressions derived by Gil-Pelaez theorem and the beta approximation of the meta distribution. Our numerical results show that deploying UAVs in wireless networks in most cases can greatly improve the system reliability, which denotes the fraction of users achieving cellular coverage, especially for the spatially-clustered users. In addition, establishing LoS links is not always beneficial since it also increases the interference. For instance, with the increase of the SINR threshold, the system reliability of a high LoS probability environment decreases dramatically and it is even lower than a low LoS probability environment. 
We also show that in highrise urban areas, UAVs can help in establishing extremely reliable (very high SINR) links. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.11200v1-abstract-full').style.display = 'none'; document.getElementById('2301.11200v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qin%2C+Y&amp;start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 
9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 
0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10