Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 609 results for author: <span class="mathjax">Wang, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Wang%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wang, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+W&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18266">arXiv:2411.18266</a> <span> [<a href="https://arxiv.org/pdf/2411.18266">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Wearable intelligent throat enables natural speech in stroke patients with dysarthria </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tang%2C+C">Chenyu Tang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+S">Shuo Gao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Cong Li</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+W">Wentian Yi</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Y">Yuxuan Jin</a>, <a href="/search/eess?searchtype=author&query=Zhai%2C+X">Xiaoxue Zhai</a>, <a href="/search/eess?searchtype=author&query=Lei%2C+S">Sixuan Lei</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Hongbei Meng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zibo Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+M">Muzi Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shengbo Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xuhang Chen</a>, <a 
href="/search/eess?searchtype=author&query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hongyun Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+N">Ningli Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenyu Wang</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+J">Jin Cao</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+X">Xiaodong Feng</a>, <a href="/search/eess?searchtype=author&query=Smielewski%2C+P">Peter Smielewski</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+Y">Yu Pan</a>, <a href="/search/eess?searchtype=author&query=Song%2C+W">Wenhui Song</a>, <a href="/search/eess?searchtype=author&query=Birchall%2C+M">Martin Birchall</a>, <a href="/search/eess?searchtype=author&query=Occhipint%2C+L+G">Luigi G. Occhipint</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18266v1-abstract-short" style="display: inline;"> Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to ena… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18266v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18266v1-abstract-full" style="display: none;"> Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to enable fluent, emotionally expressive communication. The system utilizes ultrasensitive textile strain sensors to capture high-quality signals from the neck area and supports token-level processing for real-time, continuous speech decoding, enabling seamless, delay-free communication. In tests with five stroke patients with dysarthria, IT's LLM agents intelligently corrected token errors and enriched sentence-level emotional and logical coherence, achieving low error rates (4.2% word error rate, 2.9% sentence error rate) and a 55% increase in user satisfaction. This work establishes a portable, intuitive communication platform for patients with dysarthria with the potential to be applied broadly across different neurological conditions and in multi-language support systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18266v1-abstract-full').style.display = 'none'; document.getElementById('2411.18266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures, 45 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17707">arXiv:2411.17707</a> <span> [<a href="https://arxiv.org/pdf/2411.17707">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Composite Fault Diagnosis Model for NPPs Based on Bayesian-EfficientNet Module </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+S">Siwei Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jiangwen Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+H">Hua Lin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17707v1-abstract-short" style="display: inline;"> This article focuses on the faults of important mechanical components such as pumps, valves, and pipelines in the reactor coolant system, main steam system, condensate system, and main feedwater system of nuclear power plants (NPPs). It proposes a composite multi-fault diagnosis model based on Bayesian algorithm and EfficientNet large model using data-driven deep learning fault diagnosis technolog… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17707v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17707v1-abstract-full" style="display: none;"> This article focuses on the faults of important mechanical components such as pumps, valves, and pipelines in the reactor coolant system, main steam system, condensate system, and main feedwater system of nuclear power plants (NPPs). It proposes a composite multi-fault diagnosis model based on Bayesian algorithm and EfficientNet large model using data-driven deep learning fault diagnosis technology. The aim is to evaluate the effectiveness of automatic deep learning-based large model technology through transfer learning in nuclear power plant scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17707v1-abstract-full').style.display = 'none'; document.getElementById('2411.17707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15395">arXiv:2411.15395</a> <span> [<a href="https://arxiv.org/pdf/2411.15395">pdf</a>, <a href="https://arxiv.org/format/2411.15395">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> ChatBCI: A P300 Speller BCI Leveraging Large Language Models for Improved Sentence Composition in Realistic Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hong%2C+J">Jiazhen Hong</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weinan Wang</a>, <a href="/search/eess?searchtype=author&query=Najafizadeh%2C+L">Laleh Najafizadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15395v1-abstract-short" style="display: inline;"> P300 speller BCIs allow users to compose sentences by selecting target keys on a GUI through the detection of P300 component in their EEG signals following visual stimuli. Most P300 speller BCIs require users to spell words letter by letter, or the first few initial letters, resulting in high keystroke demands that increase time, cognitive load, and fatigue. This highlights the need for more effic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15395v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15395v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15395v1-abstract-full" style="display: none;"> P300 speller BCIs allow users to compose sentences by selecting target keys on a GUI through the detection of P300 component in their EEG signals following visual stimuli. Most P300 speller BCIs require users to spell words letter by letter, or the first few initial letters, resulting in high keystroke demands that increase time, cognitive load, and fatigue. This highlights the need for more efficient, user-friendly methods for faster sentence composition. In this work, we introduce ChatBCI, a P300 speller BCI that leverages the zero-shot learning capabilities of large language models (LLMs) to suggest words from user-spelled initial letters or predict the subsequent word(s), reducing keystrokes and accelerating sentence composition. ChatBCI retrieves word suggestions through remote queries to the GPT-3.5 API. A new GUI, displaying GPT-3.5 word suggestions as extra keys is designed. SWLDA is used for the P300 classification. Seven subjects completed two online spelling tasks: 1) copy-spelling a self-composed sentence using ChatBCI, and 2) improvising a sentence using ChatBCI's word suggestions. 
Results demonstrate that in Task 1, on average, ChatBCI outperforms letter-by-letter BCI spellers, reducing time and keystrokes by 62.14% and 53.22%, respectively, and increasing information transfer rate by 198.96%. In Task 2, ChatBCI achieves 80.68% keystroke savings and a record 8.53 characters/min for typing speed. Overall, ChatBCI, by employing remote LLM queries, enhances sentence composition in realistic scenarios, significantly outperforming traditional spellers without requiring local model training or storage. ChatBCI's (multi-) word predictions, combined with its new GUI, pave the way for developing next-generation speller BCIs that are efficient and effective for real-time communication, especially for users with communication and motor disabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15395v1-abstract-full').style.display = 'none'; document.getElementById('2411.15395v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12869">arXiv:2411.12869</a> <span> [<a href="https://arxiv.org/pdf/2411.12869">pdf</a>, <a href="https://arxiv.org/format/2411.12869">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JSSC.2024.3464533">10.1109/JSSC.2024.3464533 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Omnidirectional Wireless Power Transfer for Millimetric Magnetoelectric Biomedical Implants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Z">Zhanghao Yu</a>, <a href="/search/eess?searchtype=author&query=Zou%2C+Y">Yiwei Zou</a>, <a href="/search/eess?searchtype=author&query=Woods%2C+J+E">Joshua E Woods</a>, <a href="/search/eess?searchtype=author&query=Chari%2C+P">Prahalad Chari</a>, <a href="/search/eess?searchtype=author&query=Su%2C+Y">Yumin Su</a>, <a href="/search/eess?searchtype=author&query=Robinson%2C+J+T">Jacob T Robinson</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kaiyuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12869v1-abstract-short" style="display: inline;"> Miniature bioelectronic implants promise revolutionary therapies for cardiovascular and neurological disorders. Wireless power transfer (WPT) is a significant method for miniaturization, eliminating the need for bulky batteries in devices. 
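The abstract states that word suggestions come from remote GPT-3.5 queries. A minimal sketch of such a query step follows; the prompt wording, function name, and parsing are assumptions, not the authors' implementation.

```python
# Illustrative only: fetch candidate next words for the speller GUI from
# the OpenAI chat API. Prompt and parsing are assumed, not from the paper.
from openai import OpenAI

client = OpenAI()  # expects OPENAI_API_KEY in the environment

def suggest_words(sentence_so_far: str, initial_letters: str, n: int = 5):
    prompt = (f"Sentence so far: '{sentence_so_far}'. Suggest {n} likely "
              f"next words beginning with '{initial_letters}'. "
              f"Reply with a comma-separated list only.")
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    words = [w.strip() for w in resp.choices[0].message.content.split(",")]
    return words[:n]  # displayed as extra keys on the speller GUI
```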
4. arXiv:2411.12869 [pdf, other] (https://arxiv.org/abs/2411.12869)
Omnidirectional Wireless Power Transfer for Millimetric Magnetoelectric Biomedical Implants
Subjects: eess.SY, physics.med-ph
DOI: 10.1109/JSSC.2024.3464533
Authors: Wei Wang, Zhanghao Yu, Yiwei Zou, Joshua E. Woods, Prahalad Chari, Yumin Su, Jacob T. Robinson, Kaiyuan Yang
Abstract: Miniature bioelectronic implants promise revolutionary therapies for cardiovascular and neurological disorders. Wireless power transfer (WPT) is a key enabler of miniaturization, eliminating the need for bulky batteries in devices. Despite successful demonstrations of millimetric battery-free implants in animal models, the robustness and efficiency of WPT are known to degrade significantly under the misalignment incurred by body movements, respiration, heartbeat, and limited control of implant orientation during surgery. This article presents an omnidirectional WPT platform for millimetric bioelectronic implants, employing the emerging magnetoelectric (ME) WPT modality and a magnetic-field steering technique based on multiple transmitter (TX) coils. To accurately sense the weak coupling in a miniature implant and adaptively control the multicoil TX array in a closed loop, we develop an active echo (AE) scheme using a tiny coil on the implant. Our prototype comprises a fully integrated 14.2 mm³ implantable stimulator embedding a custom low-power system on chip (SoC) powered by an ME film, a TX with a custom three-channel AE RX chip, and a multicoil TX array with mutual inductance cancellation. The AE RX achieves -161 dBm/Hz input-referred noise with a 64 dB gain tuning range to reliably sense the AE signal, and offers fast polarity detection for driver control. AE simultaneously enhances the robustness, efficiency, and charging range of ME WPT. Under 90-degree rotation from the ideal position, our omnidirectional WPT system achieves 6.8x higher power transfer efficiency (PTE) than a single-coil baseline, and the tracking error of AE degrades the PTE by less than 2% relative to ideal control.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 13 pages, 27 figures
Journal ref: IEEE Journal of Solid-State Circuits, vol. 59, no. 11, pp. 3599-3611, November 2024
5. arXiv:2411.12669 [pdf, ps, other] (https://arxiv.org/abs/2411.12669)
Constrained Coding and Deep Learning Aided Threshold Detection for Resistive Memories
Subjects: cs.IT, eess.SP
Authors: Xingwei Zhong, Kui Cai, Guanghui Song, Weijie Wang, Yao Zhu
Abstract: Resistive random access memory (ReRAM) is a promising emerging non-volatile memory (NVM) technology that shows high potential for both data storage and computing. However, its crossbar array architecture leads to the sneak path problem, which may severely degrade the reliability of data stored in the ReRAM cell. Due to the complication of memory physics and the unique features of the sneak path induced interference (SPI), it is difficult to derive an accurate channel model for it. A deep learning (DL)-based detection scheme [Zhong et al., 2020] can better mitigate the SPI, at the cost of additional power consumption and read latency. In this letter, we first propose a novel constrained coding (CC) scheme which can not only reduce the SPI in the memory array, but also effectively differentiate the memory arrays into two categories: sneak-path-free and sneak-path-affected arrays. For the sneak-path-free arrays, a simple middle-point threshold detector suffices to detect the low- and high-resistance cells of ReRAM. For the sneak-path-affected arrays, a DL detector is first trained offline (prior to data detection). To avoid the additional power consumption and latency introduced by the DL detector, we further propose a DL-based threshold detector, whose detection threshold is derived from the outputs of the DL detector; it is then used for the online data detection of all identified sneak-path-affected arrays. Simulation results demonstrate that the above CC and DL aided threshold detection scheme can effectively mitigate the SPI of the ReRAM array and achieve better error rate performance than prior-art detection schemes, without prior knowledge of the channel.
Submitted 19 November, 2024; originally announced November 2024.
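To make the two detection paths concrete, here is a small sketch under assumed resistance levels: a middle-point detector for sneak-path-free arrays, and a scalar threshold distilled from a DL detector's soft outputs for sneak-path-affected ones. The values and the threshold-derivation rule are illustrative guesses, not the letter's exact method.

```python
# Illustrative sketch of the two detector types; resistance levels and the
# threshold-derivation rule are assumptions, not the paper's exact scheme.
import numpy as np

def midpoint_detector(readout, r_low=5e3, r_high=1e6):
    """Sneak-path-free arrays: threshold midway between nominal levels."""
    threshold = (r_low + r_high) / 2
    return (readout > threshold).astype(int)  # 1 = high-resistance cell

def dl_derived_threshold_detector(readout, dl_soft_outputs):
    """Sneak-path-affected arrays: turn a trained DL detector's soft
    decisions (assumed in [0, 1], higher = high-resistance) into one
    scalar threshold, then reuse it for cheap online detection."""
    hard = dl_soft_outputs > 0.5
    threshold = (readout[hard].min() + readout[~hard].max()) / 2
    return (readout > threshold).astype(int)
```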
6. arXiv:2411.12353 [pdf] (https://arxiv.org/abs/2411.12353)
Service Restoration for Distribution Systems Based on Semi-Analytical Metamodeling of Decision-Dependent Interruption Cost and Cold Load Pickup
Subjects: eess.SY
Authors: Wei Wang, Minwu Chen, Hongbin Wang, Gaoqiang Peng, Hongzhou Chen
Abstract: Developing optimized restoration strategies for power distribution systems (PDSs) is essential to meet the pressing demand for enhanced resilience. Prior knowledge of customer interruption cost (CIC) and load restoration behaviors, particularly cold load pickup (CLPU), is crucial for guiding effective restoration; however, both are reciprocally affected by the realized customer interruption duration (CID), making them decision-dependent and challenging to model, especially given the limited understanding of the underlying physical mechanisms. This paper presents a novel approach that constructs tractable metamodels to capture the varying patterns of CIC and CLPU with CID: patterns which can be derived from limited data and reflect observed surface-level correlations rather than underlying mechanisms, thereby enabling practical surrogate modeling of these decision dependencies. Specifically, quadratic functions are used to model the increasing rate of CIC with CID based on data fitting. Several defining characteristics of CLPU are extracted, each modeled in piecewise-linear form relative to CID, and the actual restored load accounting for CLPU is subsequently retrieved. Building on these metamodels, a PDS restoration optimization model is constructed, incorporating mobile energy storage systems (MESSs) and network reconfiguration. Case studies validate our approach and highlight the unique potential of MESSs to accelerate CLPU-related restoration.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 10 pages, 10 figures, submitted to IEEE Transactions on Smart Grid
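As a toy illustration of the quadratic CIC metamodel described above, the snippet below fits cost against interruption duration with numpy; the sample points are fabricated purely for illustration.

```python
# Toy example of fitting a quadratic CIC(CID) metamodel; data are made up.
import numpy as np

cid_hours = np.array([0.5, 1.0, 2.0, 4.0, 8.0])     # interruption durations
cic_cost = np.array([40., 95., 230., 610., 1700.])  # observed costs (assumed)

cic_model = np.poly1d(np.polyfit(cid_hours, cic_cost, deg=2))
print(cic_model(3.0))  # predicted cost of a 3-hour interruption
```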
7. arXiv:2411.06399 [pdf, other] (https://arxiv.org/abs/2411.06399)
PSELDNets: Pre-trained Neural Networks on Large-scale Synthetic Datasets for Sound Event Localization and Detection
Subjects: eess.AS, cs.SD
Authors: Jinbo Hu, Yin Cao, Ming Wu, Fang Kang, Feiran Yang, Wenwu Wang, Mark D. Plumbley, Jun Yang
Abstract: Sound event localization and detection (SELD) has seen substantial advancements through learning-based methods. These systems, typically trained from scratch on specific datasets, have shown considerable generalization capabilities. Recently, deep neural networks trained on large-scale datasets have achieved remarkable success in the sound event classification (SEC) field, prompting an open question of whether these advancements can be extended to develop general-purpose SELD models. In this paper, leveraging the power of pre-trained SEC models, we propose pre-trained SELD networks (PSELDNets) trained on large-scale synthetic datasets. These synthetic datasets, generated by convolving sound events with simulated spatial room impulse responses (SRIRs), contain 1,167 hours of audio clips with an ontology of 170 sound classes. The PSELDNets are then transferred to downstream SELD tasks. When adapting PSELDNets to specific scenarios, particularly low-resource data cases, we introduce a data-efficient fine-tuning method, AdapterBit. PSELDNets are evaluated on a synthetic test set using SRIRs collected from the TAU Spatial Room Impulse Response Database (TAU-SRIR DB) and achieve satisfactory performance. We also validate the transferability of PSELDNets on three publicly available datasets and on our own collected audio recordings. Results demonstrate that PSELDNets surpass state-of-the-art systems across all publicly available datasets. Given the need for direction-of-arrival estimation, SELD generally relies on sufficient multi-channel audio clips; however, with AdapterBit, PSELDNets adapt more efficiently to various tasks using minimal multi-channel or even just monophonic audio clips, outperforming traditional fine-tuning approaches.
Submitted 10 November, 2024; originally announced November 2024.
Comments: 13 pages, 9 figures. The code is available at https://github.com/Jinbo-Hu/PSELDNets
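AdapterBit itself is not specified in the abstract. For orientation only, below is a generic bottleneck adapter of the kind commonly used for data-efficient fine-tuning beside frozen pretrained blocks; it is a stand-in, not the paper's AdapterBit.

```python
# Generic residual bottleneck adapter (a stand-in, not AdapterBit itself).
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    def __init__(self, dim: int = 512, bottleneck: int = 32):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)   # project down
        self.up = nn.Linear(bottleneck, dim)     # project back up
        self.act = nn.ReLU()

    def forward(self, x):
        # Small trainable residual path beside frozen pretrained blocks.
        return x + self.up(self.act(self.down(x)))
```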
8. arXiv:2411.03085 [pdf, other] (https://arxiv.org/abs/2411.03085)
Speech Separation with Pretrained Frontend to Minimize Domain Mismatch
Subjects: cs.SD, cs.LG, cs.MM, eess.AS
DOI: 10.1109/TASLP.2024.3446242
Authors: Wupeng Wang, Zexu Pan, Xinke Li, Shuai Wang, Haizhou Li
Abstract: Speech separation seeks to separate individual speech signals from a speech mixture. Typically, most separation models are trained on synthetic data due to the unavailability of target reference in real-world cocktail party scenarios. As a result, there exists a domain gap between real and synthetic data when deploying speech separation models in real-world applications. In this paper, we propose a self-supervised domain-invariant pretrained (DIP) frontend that is exposed to mixture data without the need for target reference speech. The DIP frontend utilizes a Siamese network with two innovative pretext tasks, mixture predictive coding (MPC) and mixture invariant coding (MIC), to capture shared contextual cues between real and synthetic unlabeled mixtures. Subsequently, we freeze the DIP frontend as a feature extractor when training the downstream speech separation models on synthetic data. By pretraining the DIP frontend with the contextual cues, we expect that the speech separation skills learned from synthetic data can be effectively transferred to real data. To benefit from the DIP frontend, we introduce a novel separation pipeline to align the feature resolution of the separation models. We evaluate speech separation quality on standard benchmarks and real-world datasets. The results confirm the superiority of our DIP frontend over existing speech separation models. This study underscores the potential of large-scale pretraining to enhance the quality and intelligibility of speech separation in real-world applications.
Submitted 5 November, 2024; originally announced November 2024.
Comments: IEEE/ACM Transactions on Audio, Speech, and Language Processing
Journal ref: IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 32 (2024), pp. 4184-4198
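The frozen-frontend step above can be pictured with a runnable miniature; the stand-in modules below are illustrative placeholders (the real DIP frontend and separator are full networks), not the paper's architecture.

```python
# Stand-in modules; names and shapes are illustrative, not the paper's.
import torch
import torch.nn as nn

dip_frontend = nn.Conv1d(1, 64, kernel_size=16, stride=8)  # pretrained frontend
separator = nn.Conv1d(64, 64, kernel_size=3, padding=1)    # downstream separator

dip_frontend.eval()
for p in dip_frontend.parameters():
    p.requires_grad = False          # frontend stays fixed during training

optimizer = torch.optim.Adam(separator.parameters(), lr=1e-4)

mixture = torch.randn(2, 1, 1600)    # (batch, channels, samples)
with torch.no_grad():
    feats = dip_frontend(mixture)    # domain-invariant features
est = separator(feats)               # only the separator is trained
```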
9. arXiv:2411.00888 [pdf, other] (https://arxiv.org/abs/2411.00888)
Topology-Aware Graph Augmentation for Predicting Clinical Trajectories in Neurocognitive Disorders
Subjects: eess.IV, cs.CV, cs.LG, q-bio.NC
Authors: Qianqian Wang, Wei Wang, Yuqi Fang, Hong-Jun Li, Andrea Bozoki, Mingxia Liu
Abstract: Brain networks/graphs derived from resting-state functional MRI (fMRI) help study the underlying pathophysiology of neurocognitive disorders by measuring neuronal activities in the brain. Some studies utilize learning-based methods for brain network analysis, but these typically suffer from low model generalizability caused by scarce labeled fMRI data. As a notable self-supervised strategy, graph contrastive learning helps leverage auxiliary unlabeled data, but existing methods generally perturb graph nodes/edges arbitrarily to generate augmented graphs, without considering the essential topology information of brain networks. To this end, we propose a topology-aware graph augmentation (TGA) framework, comprising a pretext model that trains a generalizable encoder on large-scale unlabeled fMRI cohorts and a task-specific model that performs downstream tasks on a small target dataset. In the pretext model, we design two novel topology-aware graph augmentation strategies: (1) hub-preserving node dropping, which prioritizes preserving brain hub regions according to node importance, and (2) weight-dependent edge removing, which focuses on keeping important functional connectivities based on edge weights. Experiments on 1,688 fMRI scans suggest that TGA outperforms several state-of-the-art methods.
Submitted 31 October, 2024; originally announced November 2024.
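As a concrete reading of strategy (1), the sketch below drops nodes with probability inversely proportional to weighted degree, so hub regions tend to survive augmentation. The importance measure and drop rule are plausible assumptions, not the paper's exact definitions.

```python
# Hub-preserving node dropping on an adjacency matrix; the inverse-degree
# drop probability is an assumed stand-in for the paper's importance score.
import numpy as np

def hub_preserving_drop(adj: np.ndarray, drop_ratio: float = 0.2, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    importance = adj.sum(axis=1)              # weighted node degree
    p_drop = 1.0 / (importance + 1e-8)
    p_drop /= p_drop.sum()                    # normalize to probabilities
    n_drop = int(drop_ratio * adj.shape[0])
    dropped = rng.choice(adj.shape[0], size=n_drop, replace=False, p=p_drop)
    keep = np.setdiff1d(np.arange(adj.shape[0]), dropped)
    return adj[np.ix_(keep, keep)]            # augmented brain graph
```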
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00335">arXiv:2411.00335</a> <span> [<a href="https://arxiv.org/pdf/2411.00335">pdf</a>, <a href="https://arxiv.org/format/2411.00335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> NCST: Neural-based Color Style Transfer for Video Retouching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+X">Xintao Jiang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yaosen Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Siqin Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+X">Xuming Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00335v1-abstract-short" style="display: inline;"> Video color style transfer aims to transform the color style of an original video by using a reference style image. Most existing methods employ neural networks, which come with challenges like opaque transfer processes and limited user control over the outcomes. Typically, users cannot fine-tune the resulting images or videos. To tackle this issue, we introduce a method that predicts specific par… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00335v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00335v1-abstract-full" style="display: none;"> Video color style transfer aims to transform the color style of an original video by using a reference style image. Most existing methods employ neural networks, which come with challenges like opaque transfer processes and limited user control over the outcomes. Typically, users cannot fine-tune the resulting images or videos. To tackle this issue, we introduce a method that predicts specific parameters for color style transfer using two images. Initially, we train a neural network to learn the corresponding color adjustment parameters. When applying style transfer to a video, we fine-tune the network with key frames from the video and the chosen style image, generating precise transformation parameters. These are then applied to convert the color style of both images and videos. Our experimental results demonstrate that our algorithm surpasses current methods in color style transfer quality. Moreover, each parameter in our method has a specific, interpretable meaning, enabling users to understand the color style transfer process and allowing them to perform manual fine-tuning if desired. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00335v1-abstract-full').style.display = 'none'; document.getElementById('2411.00335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17799">arXiv:2410.17799</a> <span> [<a href="https://arxiv.org/pdf/2410.17799">pdf</a>, <a href="https://arxiv.org/format/2410.17799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Hai Yu</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+C">Chaohong Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17799v1-abstract-short" style="display: inline;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17799v1-abstract-full" style="display: none;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. 
arXiv:2410.17799 [pdf, other] — https://arxiv.org/abs/2410.17799
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation
Authors: Qinglin Zhang, Luyao Cheng, Chong Deng, Qian Chen, Wen Wang, Siqi Zheng, Jiaqing Liu, Hai Yu, Chaohong Tan
Abstract: Full-duplex spoken dialogue systems are a significant advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially given human conversation dynamics such as interruptions, backchannels, and overlapping speech. In this paper, we introduce a novel end-to-end GPT-based model, OmniFlatten, for full-duplex conversation, capable of effectively modeling the complex behaviors inherent to natural conversations with low latency. To achieve full-duplex communication, we propose a multi-stage post-training scheme that progressively adapts a text-based large language model (LLM) backbone into a speech-text dialogue LLM capable of generating text and speech in real time, without modifying the architecture of the backbone LLM. The training process comprises three stages: modality alignment, half-duplex dialogue learning, and full-duplex dialogue learning. Throughout all training stages, we standardize the data using a flattening operation, which allows us to unify the training methods and the model architecture across different modalities and tasks. Our approach offers a straightforward modeling technique and a promising research direction for developing efficient and natural end-to-end full-duplex spoken dialogue systems. Audio samples of dialogues generated by OmniFlatten can be found at https://omniflatten.github.io/.
Submitted 23 October, 2024; originally announced October 2024.
Comments: Work in progress
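A minimal sketch of the flattening idea: serializing parallel text and speech token streams into one sequence that a decoder-only model can train on. The chunk sizes, interleaving pattern, and token names below are assumptions, not the paper's configuration.

```python
# Sketch: flatten two parallel token streams into a single sequence by
# interleaving fixed-size chunks, so one autoregressive model covers both.
def flatten_streams(text_tokens, speech_tokens, text_chunk=5, speech_chunk=15):
    flat, t, s = [], 0, 0
    while t < len(text_tokens) or s < len(speech_tokens):
        flat += text_tokens[t:t + text_chunk]      # a chunk of text tokens
        flat += speech_tokens[s:s + speech_chunk]  # a chunk of speech tokens
        t += text_chunk
        s += speech_chunk
    return flat

# Example: tokens tagged by modality purely for readability.
text = [f"T{i}" for i in range(10)]
speech = [f"S{i}" for i in range(30)]
print(flatten_streams(text, speech)[:12])
```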
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15620">arXiv:2410.15620</a> <span> [<a href="https://arxiv.org/pdf/2410.15620">pdf</a>, <a href="https://arxiv.org/format/2410.15620">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Acoustic Model Optimization over Multiple Data Sources: Merging and Valuation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wei%2C+V+J">Victor Junqiu Wei</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weicheng Wang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+D">Di Jiang</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+C">Conghui Tan</a>, <a href="/search/eess?searchtype=author&query=Lian%2C+R">Rongzhong Lian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15620v1-abstract-short" style="display: inline;"> Due to the rising awareness of privacy protection and the voluminous scale of speech data, it is becoming infeasible for Automatic Speech Recognition (ASR) system developers to train the acoustic model with complete data as before. For example, the data may be owned by different curators, and it is not allowed to share with others. In this paper, we propose a novel paradigm to solve salient proble… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15620v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15620v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15620v1-abstract-full" style="display: none;"> Due to the rising awareness of privacy protection and the voluminous scale of speech data, it is becoming infeasible for Automatic Speech Recognition (ASR) system developers to train the acoustic model with complete data as before. For example, the data may be owned by different curators, and it is not allowed to share with others. In this paper, we propose a novel paradigm to solve salient problems plaguing the ASR field. In the first stage, multiple acoustic models are trained based upon different subsets of the complete speech data, while in the second phase, two novel algorithms are utilized to generate a high-quality acoustic model based upon those trained on data subsets. We first propose the Genetic Merge Algorithm (GMA), which is a highly specialized algorithm for optimizing acoustic models but suffers from low efficiency. We further propose the SGD-Based Optimizational Merge Algorithm (SOMA), which effectively alleviates the efficiency bottleneck of GMA and maintains superior model accuracy. Extensive experiments on public data show that the proposed methods can significantly outperform the state-of-the-art. 
arXiv:2410.15078 [pdf, other] — https://arxiv.org/abs/2410.15078
Subjects: eess.AS (Audio and Speech Processing); eess.SP (Signal Processing)
Title: Independent Feature Enhanced Crossmodal Fusion for Match-Mismatch Classification of Speech Stimulus and EEG Response
Authors: Shitong Fan, Wenbo Wang, Feiyang Xiao, Shiheng Zhang, Qiaoxi Zhu, Jian Guan
Abstract: It is crucial for auditory attention decoding to classify matched and mismatched speech stimuli with corresponding EEG responses by exploring their relationship. However, existing methods often adopt two independent networks to encode speech stimulus and EEG response, which neglects the relationship between these signals from the two modalities. In this paper, we propose an independent feature enhanced crossmodal fusion model (IFE-CF) for match-mismatch classification, which leverages the fused features of the speech stimulus and the EEG response to achieve auditory EEG decoding. Specifically, IFE-CF contains: a crossmodal encoder that encodes the speech stimulus and the EEG response with a two-branch structure connected via a crossmodal attention mechanism; a multi-channel fusion module that fuses the two modalities by aggregating the interaction feature from the crossmodal encoder with the independent features of the speech stimulus and EEG response; and a predictor that gives the matching result. In addition, a causal mask is introduced in the crossmodal encoder to account for the time delay of the speech-EEG pair, further enhancing the feature representation for match-mismatch classification. Experiments demonstrate our method's effectiveness, with better classification accuracy than the baseline of the Auditory EEG Decoding Challenge 2023.
Submitted 19 October, 2024; originally announced October 2024.
Comments: Shitong Fan and Wenbo Wang contributed equally. Accepted by the International Symposium on Chinese Spoken Language Processing (ISCSLP) 2024
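A minimal sketch of crossmodal attention with a causal mask between the two modalities, assuming aligned frame rates; the dimensions are illustrative and the paper's actual encoder is more elaborate.

```python
# Sketch: speech features attend to EEG features under a causal mask,
# reflecting that the EEG response lags the speech stimulus in time.
import torch
import torch.nn as nn

d_model, T = 64, 100
attn = nn.MultiheadAttention(d_model, num_heads=4, batch_first=True)

speech = torch.randn(1, T, d_model)  # query: speech stimulus features
eeg = torch.randn(1, T, d_model)     # key/value: EEG response features

# True entries are masked out: speech frame i may only attend to EEG
# frames j with j <= i.
causal_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
fused, _ = attn(speech, eeg, eeg, attn_mask=causal_mask)
```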
arXiv:2410.12320 [pdf, other] — https://arxiv.org/abs/2410.12320
Subjects: eess.SY (Systems and Control)
Title: A Hierarchical DRL Approach for Resource Optimization in Multi-RIS Multi-Operator Networks
Authors: Haocheng Zhang, Wei Wang, Hao Zhou, Zhiping Lu, Ming Li
Abstract: As reconfigurable intelligent surfaces (RIS) emerge as a pivotal technology for the upcoming sixth-generation (6G) networks, their deployment within practical multi-operator (OP) networks presents significant challenges, including the coordination of RIS configurations among OPs, interference management, and privacy maintenance. A promising strategy is to treat the RIS as a public resource managed by an RIS provider (RP), which can enhance resource allocation efficiency by allowing dynamic access for multiple OPs. However, the intricate nature of coordinating management and optimizing RIS configurations significantly complicates implementation. In this paper, we propose a hierarchical deep reinforcement learning (HDRL) approach that decomposes the complicated RIS resource optimization problem into several subtasks. Specifically, a top-level RP-agent is responsible for RIS allocation, while low-level OP-agents control their assigned RISs and handle beamforming, RIS phase shifts, and user association. Using semi-Markov decision process (SMDP) theory, we establish an interaction mechanism between the RP and the OPs and introduce a hierarchical proximal policy optimization (HPPO) algorithm. Furthermore, we propose an improved sequential-HPPO (S-HPPO) algorithm to address the curse of dimensionality encountered with a single RP-agent. Experimental results validate the stability of the HPPO algorithm across various environmental parameters, demonstrating its superiority over other benchmarks for joint resource optimization. Finally, a detailed comparison between the proposed S-HPPO and HPPO algorithms shows that S-HPPO achieves faster convergence and improved performance in large-scale RIS allocation scenarios.
Submitted 16 October, 2024; originally announced October 2024.
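Purely as a structural sketch of the two-level hierarchy (top-level RIS allocation, low-level per-operator control): the random placeholder policies below stand in for trained HPPO agents and carry none of the paper's actual learning logic.

```python
# Structural sketch of the hierarchy: an RP-agent allocates RISs to
# operators, then each OP-agent chooses low-level actions for its RISs.
import random

def rp_policy(n_ris, operators):
    # Top level: assign each RIS to one operator (an SMDP-style decision).
    return {ris: random.choice(operators) for ris in range(n_ris)}

def op_policy(assigned_ris):
    # Low level: per-operator phase-shift (and, in full form, beamforming
    # and user-association) actions for the RISs it was granted.
    return {ris: {"phase_shift": random.uniform(0.0, 6.28)} for ris in assigned_ris}

operators = ["OP1", "OP2"]
allocation = rp_policy(n_ris=4, operators=operators)
for op in operators:
    mine = [r for r, owner in allocation.items() if owner == op]
    actions = op_policy(mine)
    print(op, actions)
```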
arXiv:2410.06757 [pdf] — https://arxiv.org/abs/2410.06757
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Diff-FMT: Diffusion Models for Fluorescence Molecular Tomography
Authors: Qianqian Xue, Peng Zhang, Xingyu Liu, Wenjian Wang, Guanglei Zhang
Abstract: Fluorescence molecular tomography (FMT) is a real-time, noninvasive optical imaging technology that plays a significant role in biomedical research. Nevertheless, the ill-posedness of the inverse problem poses huge challenges for FMT reconstruction. Various deep learning algorithms have been extensively explored to address these critical issues, but they still face the challenges of high data dependency and poor image quality. In this paper we propose, for the first time, an FMT reconstruction method based on a denoising diffusion probabilistic model (DDPM), termed Diff-FMT, which is capable of obtaining high-quality reconstructed images from noisy images. Specifically, we utilize the noise-addition mechanism of DDPM to generate diverse training samples. Through the step-by-step probabilistic sampling of the reverse process, we achieve fine-grained reconstruction of the image, avoiding issues such as the loss of image detail that can occur with end-to-end deep learning methods. Additionally, we introduce the fluorescence signals as conditional information in model training, so as to sample reconstructed images that are highly consistent with the input fluorescence signals. Extensive experimental results show that, compared with other cutting-edge algorithms, Diff-FMT can achieve high-resolution reconstructed images without relying on large-scale datasets.
Submitted 9 October, 2024; originally announced October 2024.
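The step-by-step reverse sampling the abstract refers to follows the standard DDPM recipe. The sketch below shows conditional ancestral sampling with the fluorescence measurements as conditioning; `eps_model` is a placeholder noise-prediction network, not the paper's architecture.

```python
# Conditional DDPM ancestral sampling: start from pure noise and denoise
# step by step, with the measurement `cond` guiding the noise predictor.
import torch

def ddpm_sample(eps_model, cond, shape, T=1000):
    betas = torch.linspace(1e-4, 0.02, T)
    alphas = 1.0 - betas
    alpha_bars = torch.cumprod(alphas, dim=0)
    x = torch.randn(shape)  # x_T ~ N(0, I)
    for t in reversed(range(T)):
        eps = eps_model(x, torch.tensor([t]), cond)  # predicted noise
        coef = (1 - alphas[t]) / torch.sqrt(1 - alpha_bars[t])
        x = (x - coef * eps) / torch.sqrt(alphas[t])  # posterior mean
        if t > 0:
            x = x + torch.sqrt(betas[t]) * torch.randn_like(x)  # sigma_t * z
    return x
```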
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06757v1-abstract-full').style.display = 'none'; document.getElementById('2410.06757v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06584">arXiv:2410.06584</a> <span> [<a href="https://arxiv.org/pdf/2410.06584">pdf</a>, <a href="https://arxiv.org/format/2410.06584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Two Birds With One Stone: Enhancing Communication and Sensing via Multi-Functional RIS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+W">Wanli Ni</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+A">Ailing Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/eess?searchtype=author&query=You%2C+C">Changsheng You</a>, <a href="/search/eess?searchtype=author&query=Eldar%2C+Y+C">Yonina C. Eldar</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Schober%2C+R">Robert Schober</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06584v1-abstract-short" style="display: inline;"> In this article, we propose new network architectures that integrate multi-functional reconfigurable intelligent surfaces (MF-RISs) into 6G networks to enhance both communication and sensing capabilities. Firstly, we elaborate how to leverage MF-RISs for improving communication performance in different communication modes including unicast, mulitcast, and broadcast and for different multi-access s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06584v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06584v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06584v1-abstract-full" style="display: none;"> In this article, we propose new network architectures that integrate multi-functional reconfigurable intelligent surfaces (MF-RISs) into 6G networks to enhance both communication and sensing capabilities. Firstly, we elaborate how to leverage MF-RISs for improving communication performance in different communication modes including unicast, mulitcast, and broadcast and for different multi-access schemes. Next, we emphasize synergistic benefits of integrating MF-RISs with wireless sensing, enabling more accurate and efficient target detection in 6G networks. Furthermore, we present two schemes that utilize MF-RISs to enhance the performance of integrated sensing and communication (ISAC). 
arXiv:2410.05647 [pdf, other] — https://arxiv.org/abs/2410.05647
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: FGCL: Fine-grained Contrastive Learning For Mandarin Stuttering Event Detection
Authors: Han Jiang, Wenyu Wang, Yiquan Zhou, Hongwu Ding, Jiacheng Xu, Jihua Zhu
Abstract: This paper presents the T031 team's approach to the StutteringSpeech Challenge at SLT 2024. Mandarin Stuttering Event Detection (MSED) aims to detect instances of stuttering events in Mandarin speech. We propose a detailed acoustic analysis method that improves the accuracy of stutter detection by capturing subtle nuances that previous Stuttering Event Detection (SED) techniques have overlooked. To this end, we introduce the Fine-Grained Contrastive Learning (FGCL) framework for MSED. Specifically, we model the frame-level probabilities of stuttering events and introduce a mining algorithm to identify both easy and confusing frames. We then propose a stutter contrast loss to enhance the distinction between stuttered and fluent speech frames, thereby improving the discriminative capability of stuttered feature embeddings. Extensive evaluations on English and Mandarin datasets demonstrate the effectiveness of FGCL, achieving a significant increase of over 5.0% in F1 score on Mandarin data.
Submitted 7 October, 2024; originally announced October 2024.
Comments: Accepted to SLT 2024
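The stutter contrast loss is not specified in the abstract beyond its goal of separating stuttered from fluent frames; a supervised-contrastive-style frame-level loss of the following form illustrates the idea (an assumption, not the paper's exact loss or mining strategy).

```python
# Frame-level supervised contrastive loss: pull same-class frame embeddings
# together and push stuttered/fluent frames apart in embedding space.
import torch
import torch.nn.functional as F

def stutter_contrast_loss(frame_emb, labels, temperature=0.1):
    # frame_emb: (N, D) frame embeddings; labels: (N,), 1 = stuttered, 0 = fluent
    z = F.normalize(frame_emb, dim=1)
    sim = z @ z.T / temperature
    self_mask = torch.eye(len(labels), dtype=torch.bool)
    pos = ((labels.unsqueeze(0) == labels.unsqueeze(1)) & ~self_mask).float()
    logits = sim.masked_fill(self_mask, -1e9)  # exclude self-similarity
    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
    # Average log-probability of same-class (positive) frames per anchor.
    return -(log_prob * pos).sum(1).div(pos.sum(1).clamp(min=1)).mean()
```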
arXiv:2410.00620 [pdf, ps, other] — https://arxiv.org/abs/2410.00620
Subjects: stat.ML (Machine Learning); cs.LG (Machine Learning); eess.SP (Signal Processing)
Title: Differentiable Interacting Multiple Model Particle Filtering
Authors: John-Joseph Brady, Yuhui Luo, Wenwu Wang, Víctor Elvira, Yunpeng Li
Abstract: We propose a sequential Monte Carlo algorithm for parameter learning when the studied model exhibits random discontinuous jumps in behaviour. To facilitate the learning of high-dimensional parameter sets, such as those associated with neural networks, we adopt the emerging framework of differentiable particle filtering, wherein parameters are trained by gradient descent. We design a new differentiable interacting multiple model particle filter capable of simultaneously learning the individual behavioural regimes and the model that controls the jumping. In contrast to previous approaches, our algorithm allows control of the computational effort assigned per regime while using the probability of being in a given regime to guide sampling. Furthermore, we develop a new gradient estimator that has lower variance than established approaches and remains fast to compute, and we prove its consistency. We establish new theoretical results for the presented algorithms and demonstrate superior numerical performance compared to the previous state-of-the-art algorithms.
Submitted 1 October, 2024; originally announced October 2024.
MSC Class: 62M20; 62F12
arXiv:2410.00013 [pdf, other] — https://arxiv.org/abs/2410.00013
Subjects: eess.SP (Signal Processing); cs.LG (Machine Learning)
Title: Enhancing EEG Signal Generation through a Hybrid Approach Integrating Reinforcement Learning and Diffusion Models
Authors: Yang An, Yuhao Tong, Weikai Wang, Steven W. Su
Abstract: The present study introduces an innovative approach to the synthesis of Electroencephalogram (EEG) signals by integrating diffusion models with reinforcement learning. This integration addresses key challenges associated with traditional EEG data acquisition, including participant burden, privacy concerns, and the financial cost of obtaining high-fidelity clinical data. Our methodology enhances the generation of EEG signals with detailed temporal and spectral features, enriching the authenticity and diversity of synthetic datasets. The uniqueness of our approach lies in its capacity to concurrently model time-domain characteristics, such as waveform morphology, and frequency-domain features, including rhythmic brainwave patterns, within a cohesive generative framework. This is executed through the reinforcement learning model's autonomous selection of parameter update strategies, which steers the diffusion process to accurately reflect the complex dynamics inherent in EEG signals. We validate the efficacy of our approach using both the BCI Competition IV 2a dataset and a proprietary dataset, each collected under stringent experimental conditions. Our results indicate that the method preserves participant privacy by generating synthetic data that lack biometric identifiers, and concurrently improves the efficiency of model training by minimizing reliance on large annotated datasets. This research offers dual contributions: first, it advances EEG research by providing a novel tool for data augmentation and the advancement of machine learning algorithms; second, it enhances brain-computer interface technologies by offering a robust solution for training models on diverse and representative EEG datasets. Collectively, this study establishes a foundation for future investigations in neurological care and the development of tailored treatment protocols in neurorehabilitation.
Submitted 14 September, 2024; originally announced October 2024.
arXiv:2409.19276 [pdf] — https://arxiv.org/abs/2409.19276
Subjects: eess.SP (Signal Processing)
Title: Deep Learning-based Automated Diagnosis of Obstructive Sleep Apnea and Sleep Stage Classification in Children Using Millimeter-wave Radar and Pulse Oximeter
Authors: Wei Wang, Ruobing Song, Yunxiao Wu, Li Zheng, Wenyu Zhang, Zhaoxi Chen, Gang Li, Zhifei Xu
Abstract: Study Objectives: To evaluate the agreement between a millimeter-wave radar-based device and polysomnography (PSG) in the diagnosis of obstructive sleep apnea (OSA) and the classification of sleep stage in children. Methods: 281 children, aged 1 to 18 years, who underwent sleep monitoring between September and November 2023 at the Sleep Center of Beijing Children's Hospital, Capital Medical University, were recruited into the study. All enrolled children underwent sleep monitoring by PSG and by the millimeter-wave radar-based device, QSA600, simultaneously. QSA600 recordings were automatically analyzed using a deep learning model, while the PSG data were manually scored. Results: The Obstructive Apnea-Hypopnea Index (OAHI) obtained from QSA600 and PSG demonstrates a high level of agreement, with an intraclass correlation coefficient of 0.945 (95% CI: 0.93 to 0.96). Bland-Altman analysis indicates that the mean difference of OAHI between QSA600 and PSG is -0.10 events/h (95% CI: -11.15 to 10.96). The deep learning model, evaluated through cross-validation, showed good sensitivity (81.8%, 84.3% and 89.7%) and specificity (90.5%, 95.3% and 97.1%) for diagnosing children with OAHI>1, OAHI>5 and OAHI>10. The area under the receiver operating characteristic curve is 0.923, 0.955 and 0.988, respectively. For sleep stage classification, the model achieved Kappa coefficients of 0.854, 0.781, and 0.734, with corresponding overall accuracies of 95.0%, 84.8%, and 79.7% for Wake-Sleep classification, Wake-REM-Light-Deep classification, and Wake-REM-N1-N2-N3 classification, respectively. Conclusions: QSA600 has demonstrated high agreement with PSG in diagnosing OSA and performing sleep staging in children. The device is portable, low-load, and suitable for follow-up and long-term pediatric sleep assessment.
Submitted 1 October, 2024; v1 submitted 28 September, 2024; originally announced September 2024.
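For readers unfamiliar with the agreement statistics quoted above, the Bland-Altman quantities can be computed as below; the data here are synthetic stand-ins for the paired device/PSG OAHI values, not the study's measurements.

```python
# Bland-Altman agreement: mean difference between paired measurements and
# the 95% limits of agreement (mean +/- 1.96 * SD of the differences).
import numpy as np

def bland_altman(a, b):
    diff = a - b
    mean_diff = diff.mean()
    half_width = 1.96 * diff.std(ddof=1)
    return mean_diff, (mean_diff - half_width, mean_diff + half_width)

qsa600 = np.array([2.1, 5.3, 0.8, 12.4])  # OAHI from device (synthetic)
psg = np.array([2.0, 5.9, 1.1, 11.8])     # OAHI from PSG (synthetic)
print(bland_altman(qsa600, psg))
```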
arXiv:2409.19217 [pdf] — https://arxiv.org/abs/2409.19217
Subjects: eess.SP (Signal Processing)
Title: Detection of Sleep Apnea-Hypopnea Events Using Millimeter-wave Radar and Pulse Oximeter
Authors: Wei Wang, Chenyang Li, Zhaoxi Chen, Wenyu Zhang, Zetao Wang, Xi Guo, Jian Guan, Gang Li
Abstract: Obstructive Sleep Apnea-Hypopnea Syndrome (OSAHS) is a sleep-related breathing disorder associated with significant morbidity and mortality worldwide. The gold standard for OSAHS diagnosis, polysomnography (PSG), faces challenges in popularization due to its high cost and complexity. Recently, radar has shown potential for detecting sleep apnea-hypopnea events (SAE), with the advantages of low cost and non-contact monitoring. However, existing studies, especially those using deep learning, employ a segment-based classification approach for SAE detection, making the task of event quantity estimation difficult. Additionally, radar-based SAE detection is susceptible to interference from body movements and the environment. Oxygen saturation (SpO2) can offer valuable information about OSAHS, but it also has certain limitations and cannot be used alone for diagnosis. In this study, we propose a method using millimeter-wave radar and a pulse oximeter to detect SAE, called ROSA. It fuses information from both sensors and directly predicts the temporal localization of SAE. Experimental results demonstrate a high degree of consistency (ICC=0.9864) between the AHI from ROSA and from PSG. This study presents an effective method with a low-load device for the diagnosis of OSAHS.
Submitted 27 September, 2024; originally announced September 2024.
arXiv:2409.12352 [pdf, other] — https://arxiv.org/abs/2409.12352
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: META-CAT: Speaker-Informed Speech Embeddings via Meta Information Concatenation for Multi-talker ASR
Authors: Jinhan Wang, Weiqing Wang, Kunal Dhawan, Taejin Park, Myungjong Kim, Ivan Medennikov, He Huang, Nithin Koluguri, Jagadeesh Balam, Boris Ginsburg
Abstract: We propose a novel end-to-end multi-talker automatic speech recognition (ASR) framework that enables both multi-speaker (MS) ASR and target-speaker (TS) ASR. Our proposed model is trained in a fully end-to-end manner, incorporating speaker supervision from a pre-trained speaker diarization module. We introduce an intuitive yet effective method for masking ASR encoder activations using output from the speaker supervision module, a technique we term Meta-Cat (meta-information concatenation), which can be applied to both MS-ASR and TS-ASR. Our results demonstrate that the proposed architecture achieves competitive performance on both MS-ASR and TS-ASR tasks, without the need for traditional methods such as neural mask estimation or masking at the audio or feature level. Furthermore, we demonstrate a glimpse of a unified dual-task model that can efficiently handle both MS-ASR and TS-ASR tasks. Thus, this work illustrates that a robust end-to-end multi-talker ASR framework can be implemented with a streamlined architecture, obviating the need for the complex speaker filtering mechanisms employed in previous studies.
Submitted 18 September, 2024; originally announced September 2024.
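A minimal sketch of masking encoder activations with diarization output; the soft-masking and concatenation variants below are simplified assumptions about how the speaker meta information could enter, not the paper's exact mechanism.

```python
# Sketch: use per-frame speaker posteriors from a diarizer to (a) soft-mask
# ASR encoder activations toward a target speaker, or (b) concatenate the
# posteriors as meta information along the feature dimension.
import torch

B, T, D, S = 1, 50, 256, 2            # batch, frames, feature dim, speakers
enc = torch.randn(B, T, D)            # ASR encoder activations
spk_prob = torch.rand(B, T, S)        # per-frame speaker posteriors (diarizer)

target = 0                            # target-speaker index for TS-ASR
masked = enc * spk_prob[..., target:target + 1]   # (a) soft-mask by speaker
augmented = torch.cat([enc, spk_prob], dim=-1)    # (b) concat: (B, T, D + S)
```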
arXiv:2409.09352 [pdf, other] — https://arxiv.org/abs/2409.09352
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: MacST: Multi-Accent Speech Synthesis via Text Transliteration for Accent Conversion
Authors: Sho Inoue, Shuai Wang, Wanxing Wang, Pengcheng Zhu, Mengxiao Bi, Haizhou Li
Abstract: In accented voice conversion, or accent conversion, we seek to convert the accent of speech from one accent to another while preserving speaker identity and semantic content. In this study, we formulate a novel method for creating multi-accented speech samples, i.e., pairs of accented speech samples from the same speaker, through text transliteration for training accent conversion systems. We begin by generating transliterated text with Large Language Models (LLMs), which is then fed into multilingual TTS models to synthesize accented English speech. As a reference system, we built a sequence-to-sequence model on the synthetic parallel corpus for accent conversion. We validated the proposed method for both native and non-native English speakers. Subjective and objective evaluations further validate our dataset's effectiveness in accent conversion studies.
Submitted 14 September, 2024; originally announced September 2024.
Comments: Project page with Speech Demo: https://github.com/shinshoji01/MacST-project-page
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page with Speech Demo: https://github.com/shinshoji01/MacST-project-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08585">arXiv:2409.08585</a> <span> [<a href="https://arxiv.org/pdf/2409.08585">pdf</a>, <a href="https://arxiv.org/format/2409.08585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Optimizing 4D Lookup Table for Low-light Video Enhancement via Wavelet Priori </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+J">Jinhong He</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+M">Minglong Xue</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenhai Wang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+M">Mingliang Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08585v1-abstract-short" style="display: inline;"> Low-light video enhancement is highly demanding in maintaining spatiotemporal color consistency. Therefore, improving the accuracy of color mapping and keeping the latency low is challenging. Based on this, we propose incorporating Wavelet-priori for 4D Lookup Table (WaveLUT), which effectively enhances the color coherence between video frames and the accuracy of color mapping while maintaining lo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08585v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08585v1-abstract-full" style="display: none;"> Low-light video enhancement is highly demanding in maintaining spatiotemporal color consistency. Therefore, improving the accuracy of color mapping and keeping the latency low is challenging. Based on this, we propose incorporating Wavelet-priori for 4D Lookup Table (WaveLUT), which effectively enhances the color coherence between video frames and the accuracy of color mapping while maintaining low latency. Specifically, we use the wavelet low-frequency domain to construct an optimized lookup prior and achieve an adaptive enhancement effect through a designed Wavelet-prior 4D lookup table. To effectively compensate the a priori loss in the low light region, we further explore a dynamic fusion strategy that adaptively determines the spatial weights based on the correlation between the wavelet lighting prior and the target intensity structure. In addition, during the training phase, we devise a text-driven appearance reconstruction method that dynamically balances brightness and content through multimodal semantics-driven Fourier spectra. 
arXiv:2409.08552 [pdf, other] — https://arxiv.org/abs/2409.08552
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Unified Audio Event Detection
Authors: Yidi Jiang, Ruijie Tao, Wen Huang, Qian Chen, Wen Wang
Abstract: Sound Event Detection (SED) detects regions of sound events, while Speaker Diarization (SD) segments speech conversations attributed to individual speakers. In SED, all speaker segments are classified as a single speech event, while in SD, non-speech sounds are treated merely as background noise. Thus, both tasks provide only a partial analysis in complex audio scenarios involving both speech conversation and non-speech sounds. In this paper, we introduce a novel task called Unified Audio Event Detection (UAED) for comprehensive audio analysis. UAED explores the synergy between the SED and SD tasks, simultaneously detecting non-speech sound events and fine-grained speech events based on speaker identities. To tackle this task, we propose a Transformer-based UAED (T-UAED) framework and construct UAED data derived from the Librispeech dataset and the DESED soundbank. Experiments demonstrate that the proposed framework effectively exploits task interactions and substantially outperforms a baseline that simply combines the outputs of SED and SD models. T-UAED also shows its versatility by performing comparably to specialized models on the individual SED and SD tasks on the DESED and CALLHOME datasets.
Submitted 13 September, 2024; originally announced September 2024.
Comments: submitted to ICASSP 2025
To tackle this task, we propose a Transformer-based UAED (T-UAED) framework and construct the UAED Data derived from the Librispeech dataset and DESED soundbank. Experiments demonstrate that the proposed framework effectively exploits task interactions and substantially outperforms the baseline that simply combines the outputs of SED and SD models. T-UAED also shows its versatility by performing comparably to specialized models for individual SED and SD tasks on DESED and CALLHOME datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08552v1-abstract-full').style.display = 'none'; document.getElementById('2409.08552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08525">arXiv:2409.08525</a> <span> [<a href="https://arxiv.org/pdf/2409.08525">pdf</a>, <a href="https://arxiv.org/ps/2409.08525">ps</a>, <a href="https://arxiv.org/format/2409.08525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Frequency Diverse RIS (FD-RIS) Enhanced Wireless Communications via Joint Distance-Angle Beamforming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiao%2C+H">Han Xiao</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xiaoyan Hu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenjie Wang</a>, <a href="/search/eess?searchtype=author&query=Wong%2C+K">Kai-Kit Wong</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08525v1-abstract-short" style="display: inline;"> The conventional reconfigurable intelligent surface (RIS) assisted far-field communication systems can only implement angle beamforming, which actually limits the capability for reconfiguring the wireless propagation environment. To overcome this limitation, this paper proposes a newly designed frequency diverse RIS (FD-RIS), which can achieve joint distance-angle beamforming with the assistance o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08525v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08525v1-abstract-full" style="display: none;"> The conventional reconfigurable intelligent surface (RIS) assisted far-field communication systems can only implement angle beamforming, which actually limits the capability for reconfiguring the wireless propagation environment. 
To overcome this limitation, this paper proposes a newly designed frequency diverse RIS (FD-RIS), which can achieve joint distance-angle beamforming with the assistance of the time modulation technology. The signal processing model for FD-RIS-aided wireless communications is first derived. Then, an optimization problem aimed at maximizing the achievable rate is formulated where the frequency-time modulations are jointly optimized to achieve distance-angle beamforming. Furthermore, a novel iterative algorithm based on the cross-entropy optimization (CEO) framework is proposed to effectively handle the non-convex optimization problem. The numerical results validate that the proposed FD-RIS assisted communication scheme can achieve a notable performance improvement compared with the baseline scheme utilizing traditional RIS. In addition, the effectiveness of the proposed CEO algorithm is further verified by comparing with the benchmark using the genetic algorithm (GA). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08525v1-abstract-full').style.display = 'none'; document.getElementById('2409.08525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07614">arXiv:2409.07614</a> <span> [<a href="https://arxiv.org/pdf/2409.07614">pdf</a>, <a href="https://arxiv.org/format/2409.07614">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FlowSep: Language-Queried Sound Separation with Rectified Flow Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yuan%2C+Y">Yi Yuan</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xubo Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Haohe Liu</a>, <a href="/search/eess?searchtype=author&query=Plumbley%2C+M+D">Mark D. Plumbley</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenwu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07614v1-abstract-short" style="display: inline;"> Language-queried audio source separation (LASS) focuses on separating sounds using textual descriptions of the desired sources. Current methods mainly use discriminative approaches, such as time-frequency masking, to separate target sounds and minimize interference from other sources. 
However, these models face challenges when separating overlapping soundtracks, which may lead to artifacts such as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07614v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07614v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07614v1-abstract-full" style="display: none;"> Language-queried audio source separation (LASS) focuses on separating sounds using textual descriptions of the desired sources. Current methods mainly use discriminative approaches, such as time-frequency masking, to separate target sounds and minimize interference from other sources. However, these models face challenges when separating overlapping soundtracks, which may lead to artifacts such as spectral holes or incomplete separation. Rectified flow matching (RFM), a generative model that establishes linear relations between the distribution of data and noise, offers superior theoretical properties and simplicity, but has not yet been explored in sound separation. In this work, we introduce FlowSep, a new generative model based on RFM for LASS tasks. FlowSep learns linear flow trajectories from noise to target source features within the variational autoencoder (VAE) latent space. During inference, the RFM-generated latent features are reconstructed into a mel-spectrogram via the pre-trained VAE decoder, followed by a pre-trained vocoder to synthesize the waveform. Trained on 1,680 hours of audio data, FlowSep outperforms the state-of-the-art models across multiple benchmarks, as evaluated with subjective and objective metrics. Additionally, our results show that FlowSep surpasses a diffusion-based LASS model in both separation quality and inference efficiency, highlighting its strong potential for audio source separation tasks. Code, pre-trained models and demos can be found at: https://audio-agi.github.io/FlowSep_demo/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07614v1-abstract-full').style.display = 'none'; document.getElementById('2409.07614v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
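<p class="is-size-7">The rectified-flow objective summarized above admits a compact training loop: interpolate linearly between a noise sample and a data latent and regress the model's velocity prediction onto their constant difference; sampling is then a short ODE integration. A hedged sketch assuming a model(x, t) velocity predictor over VAE latents; the names are illustrative, not FlowSep's API.</p> <pre><code class="language-python">
import torch

def rfm_step(model, x1, opt):
    """One rectified-flow-matching update: x0 is noise, x1 a data latent;
    points on the line between them share the velocity target (x1 - x0)."""
    x0 = torch.randn_like(x1)
    t = torch.rand(x1.shape[0], 1, device=x1.device)
    xt = (1.0 - t) * x0 + t * x1               # linear interpolation
    loss = ((model(xt, t) - (x1 - x0)) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

@torch.no_grad()
def rfm_sample(model, shape, steps=25, device="cpu"):
    """Euler integration from noise to a latent, which a pre-trained VAE
    decoder and vocoder would then turn into a waveform."""
    x = torch.randn(shape, device=device)
    for i in range(steps):
        t = torch.full((shape[0], 1), i / steps, device=device)
        x = x + model(x, t) / steps
    return x
</code></pre>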
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06656">arXiv:2409.06656</a> <span> [<a href="https://arxiv.org/pdf/2409.06656">pdf</a>, <a href="https://arxiv.org/format/2409.06656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Sortformer: Seamless Integration of Speaker Diarization and ASR by Bridging Timestamps and Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06656v1-abstract-short" style="display: inline;"> We propose Sortformer, a novel neural model for speaker diarization, trained with unconventional objectives compared to existing end-to-end diarization models. The permutation problem in speaker diarization has long been regarded as a critical challenge. Most prior end-to-end diarization systems employ permutation invariant loss (PIL), which optimizes for the permutation that yields the lowest err… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06656v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06656v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06656v1-abstract-full" style="display: none;"> We propose Sortformer, a novel neural model for speaker diarization, trained with unconventional objectives compared to existing end-to-end diarization models. The permutation problem in speaker diarization has long been regarded as a critical challenge. Most prior end-to-end diarization systems employ permutation invariant loss (PIL), which optimizes for the permutation that yields the lowest error. In contrast, we introduce Sort Loss, which enables a diarization model to autonomously resolve permutation, with or without PIL. We demonstrate that combining Sort Loss and PIL achieves performance competitive with state-of-the-art end-to-end diarization models trained exclusively with PIL. 
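<p class="is-size-7">The contrast between PIL and the sort-based objective can be made concrete: PIL searches all speaker-column permutations for the lowest error, while a sort objective fixes the column order by each speaker's first active frame. A toy sketch over binary activity matrices with BCE as the frame loss; it assumes every reference speaker is active at least once and is not the paper's exact formulation.</p> <pre><code class="language-python">
import itertools
import torch
import torch.nn.functional as F

def pil_loss(pred, ref):
    """Permutation-invariant loss: try every speaker permutation of the
    reference and keep the cheapest. pred, ref: (T, S) floats in [0, 1]."""
    S = ref.shape[1]
    losses = [F.binary_cross_entropy(pred, ref[:, list(perm)])
              for perm in itertools.permutations(range(S))]
    return torch.stack(losses).min()

def sort_loss(pred, ref):
    """Sort-based objective: order reference speakers by arrival time
    (first active frame), then compare directly, with no search."""
    T, S = ref.shape
    frame = torch.arange(T).unsqueeze(1).expand(T, S)
    first = torch.where(ref > 0.5, frame, torch.full_like(frame, T))
    order = torch.argsort(first.min(dim=0).values)
    return F.binary_cross_entropy(pred, ref[:, order])
</code></pre>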
Crucially, we present a streamlined multispeaker ASR architecture that leverages Sortformer as a speaker supervision model, embedding speaker label estimation within the ASR encoder state using a sinusoidal kernel function. This approach resolves the speaker permutation problem through sorted objectives, effectively bridging speaker-label timestamps and speaker tokens. In our experiments, we show that the proposed multispeaker ASR architecture, enhanced with speaker supervision, improves performance via adapter techniques. Code and trained models will be made publicly available via the NVIDIA NeMo framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06656v1-abstract-full').style.display = 'none'; document.getElementById('2409.06656v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02447">arXiv:2409.02447</a> <span> [<a href="https://arxiv.org/pdf/2409.02447">pdf</a>, <a href="https://arxiv.org/ps/2409.02447">ps</a>, <a href="https://arxiv.org/format/2409.02447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FDA-MIMO-Based Integrated Sensing and Communication System with Complex Coefficients Index Modulation for Multi-Target Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jian%2C+J">Jiangwei Jian</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+B">Bang Huang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+W">Wenkai Jia</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+M">Mingcheng Fu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen-Qin Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qimao Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02447v1-abstract-short" style="display: inline;"> The echo signals of frequency diverse array multiple-input multiple-output (FDA-MIMO) feature angle-range coupling, enabling simultaneous discrimination and estimation of multiple targets at different locations. In light of this, based on FDA-MIMO, this paper explores a sensing-centric integrated sensing and communication (ISAC) system for multi-target sensing. On the transmitter side, the comple… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02447v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02447v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02447v1-abstract-full" style="display: none;"> The echo signals of frequency diverse array multiple-input multiple-output (FDA-MIMO) feature angle-range coupling, enabling simultaneous discrimination and estimation of multiple targets at different locations.
In light of this, based on FDA-MIMO, this paper explores a sensing-centric integrated sensing and communication (ISAC) system for multi-target sensing. On the transmitter side, the complex coefficients index modulation (CCIM) scheme is designed, which carries extra bits by selecting complex coefficients from the coefficient vector. At the sensing receiver, we propose the FDA-MIMO-based spatial spectrum multi-target estimation (SSMTE) method, which first jointly estimates the angle and distance of targets and then estimates the velocities. To reduce the sensing computational complexity, the low-complexity spatial spectrum estimation (LCSSE) algorithm is proposed. LCSSE reduces the complexity without degrading the sensing performance by converting the joint angle-range search into two one-dimensional searches. To address the range ambiguity caused by frequency offset, a frequency offset design criterion (FODC) is proposed. It designs the integer and fractional components of the frequency offset to ensure the ambiguity distance exceeds the maximum sensing range, thereby alleviating parameter pairing errors. Moreover, the closed-form expressions for the bit error rate (BER) tight upper bound and the Cramér-Rao bound (CRB) are derived. Simulation results show that the proposed system excels in multi-target sensing and communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02447v1-abstract-full').style.display = 'none'; document.getElementById('2409.02447v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
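<p class="is-size-7">The CCIM idea of carrying extra bits through coefficient selection reduces to index mapping: a group of log2(M) bits picks one entry of an M-entry complex coefficient vector, and the chosen coefficient weights the transmit waveform. A toy illustration; the paper's actual coefficient design and detector are not reproduced here.</p> <pre><code class="language-python">
import numpy as np

def ccim_select(bits, coeff_vector):
    """Index modulation: log2(M) bits choose one of M complex
    coefficients, so the selection itself carries the extra bits."""
    k = int(np.log2(len(coeff_vector)))
    idx = int("".join(str(int(b)) for b in bits[:k]), 2)
    return coeff_vector[idx], idx

rng = np.random.default_rng(0)
coeffs = np.exp(2j * np.pi * rng.random(8))  # toy 8-entry coefficient vector
coef, idx = ccim_select([1, 0, 1], coeffs)   # bits 101 -> index 5
</code></pre>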
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01438">arXiv:2409.01438</a> <span> [<a href="https://arxiv.org/pdf/2409.01438">pdf</a>, <a href="https://arxiv.org/format/2409.01438">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Resource-Efficient Adaptation of Speech Foundation Models for Multi-Speaker ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01438v1-abstract-short" style="display: inline;"> Speech foundation models have achieved state-of-the-art (SoTA) performance across various tasks, such as automatic speech recognition (ASR) in hundreds of languages. However, multi-speaker ASR remains a challenging task for these models due to data scarcity and sparsity. In this paper, we present approaches to enable speech foundation models to process and understand multi-speaker speech with limi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01438v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01438v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01438v1-abstract-full" style="display: none;"> Speech foundation models have achieved state-of-the-art (SoTA) performance across various tasks, such as automatic speech recognition (ASR) in hundreds of languages. However, multi-speaker ASR remains a challenging task for these models due to data scarcity and sparsity. In this paper, we present approaches to enable speech foundation models to process and understand multi-speaker speech with limited training data. Specifically, we adapt a speech foundation model for the multi-speaker ASR task using only telephonic data. Remarkably, the adapted model also performs well on meeting data without any fine-tuning, demonstrating the generalization ability of our approach. We conduct several ablation studies to analyze the impact of different parameters and strategies on model performance. Our findings highlight the effectiveness of our methods. Results show that fewer parameters give better overall cpWER, which, although counter-intuitive, provides insights into adapting speech foundation models for multi-speaker ASR tasks with minimal annotated data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01438v1-abstract-full').style.display = 'none'; document.getElementById('2409.01438v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
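<p class="is-size-7">A common way to give a frozen foundation model a small number of trainable parameters, consistent with the finding that fewer parameters can yield better cpWER, is a residual bottleneck adapter. A generic sketch, not necessarily the exact mechanism used in the paper.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Residual bottleneck adapter trained while the speech foundation
    model itself stays frozen."""
    def __init__(self, dim=512, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        nn.init.zeros_(self.up.weight)  # start as an identity mapping
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))

# freeze the foundation model and train only the adapters:
# for p in foundation.parameters(): p.requires_grad = False
</code></pre>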
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16532">arXiv:2408.16532</a> <span> [<a href="https://arxiv.org/pdf/2408.16532">pdf</a>, <a href="https://arxiv.org/format/2408.16532">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yifu Chen</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Ziang Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoda Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Y">Yidi Jiang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16532v2-abstract-short" style="display: inline;"> Language models have been effectively applied to modeling natural signals, such as images, video, speech, and audio. A crucial component of these models is the codec tokenizer, which compresses high-dimensional natural signals into lower-dimensional discrete tokens. 
In this paper, we introduce WavTokenizer, which offers several advantages over previous SOTA acoustic codec models in the audio domai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16532v2-abstract-full').style.display = 'inline'; document.getElementById('2408.16532v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16532v2-abstract-full" style="display: none;"> Language models have been effectively applied to modeling natural signals, such as images, video, speech, and audio. A crucial component of these models is the codec tokenizer, which compresses high-dimensional natural signals into lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, which offers several advantages over previous SOTA acoustic codec models in the audio domain: 1) extreme compression. By compressing the layers of quantizers and the temporal dimension of the discrete codec, one-second audio of 24 kHz sampling rate requires only a single quantizer with 40 or 75 tokens. 2) improved subjective quality. Despite the reduced number of tokens, WavTokenizer achieves state-of-the-art reconstruction quality with outstanding UTMOS scores and inherently contains richer semantic information. Specifically, we achieve these results by designing a broader VQ space, extended contextual windows, and improved attention networks, as well as introducing a powerful multi-scale discriminator and an inverse Fourier transform structure. We conducted extensive reconstruction experiments in the domains of speech, audio, and music. WavTokenizer exhibited strong performance across various objective and subjective metrics compared to state-of-the-art models. We also tested semantic information, VQ utilization, and adaptability to generative models. Comprehensive ablation studies confirm the necessity of each module in WavTokenizer. The related code, demos, and pre-trained models are available at https://github.com/jishengpeng/WavTokenizer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16532v2-abstract-full').style.display = 'none'; document.getElementById('2408.16532v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
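<p class="is-size-7">The compression arithmetic quoted above is easy to make concrete: at 24 kHz, 40 tokens per second means one token per 600 audio samples, each token being the index of the nearest entry in a single quantizer's codebook. A minimal sketch; the codebook size and feature dimension below are assumptions.</p> <pre><code class="language-python">
import torch

def vq_tokens(frames, codebook):
    """Single-quantizer VQ: each frame embedding becomes the index of
    its nearest codebook entry. frames: (T, D); codebook: (K, D)."""
    return torch.cdist(frames, codebook).argmin(dim=1)  # (T,) integer ids

sr, tokens_per_second = 24_000, 40   # figures quoted in the abstract
hop = sr // tokens_per_second        # 600 audio samples per token
codebook = torch.randn(4096, 512)    # K and D here are assumptions
tokens = vq_tokens(torch.randn(tokens_per_second, 512), codebook)
</code></pre>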
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14977">arXiv:2408.14977</a> <span> [<a href="https://arxiv.org/pdf/2408.14977">pdf</a>, <a href="https://arxiv.org/format/2408.14977">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LN-Gen: Rectal Lymph Nodes Generation via Anatomical Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+W">Weidong Guo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hantao Zhang</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+S">Shouhong Wan</a>, <a href="/search/eess?searchtype=author&query=Zou%2C+B">Bingbing Zou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wanqin Wang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+P">Peiquan Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14977v1-abstract-short" style="display: inline;"> Accurate segmentation of rectal lymph nodes is crucial for the staging and treatment planning of rectal cancer. However, the complexity of the surrounding anatomical structures and the scarcity of annotated data pose significant challenges. This study introduces a novel lymph node synthesis technique aimed at generating diverse and realistic synthetic rectal lymph node samples to mitigate the reli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14977v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14977v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14977v1-abstract-full" style="display: none;"> Accurate segmentation of rectal lymph nodes is crucial for the staging and treatment planning of rectal cancer. However, the complexity of the surrounding anatomical structures and the scarcity of annotated data pose significant challenges. This study introduces a novel lymph node synthesis technique aimed at generating diverse and realistic synthetic rectal lymph node samples to mitigate the reliance on manual annotation. Unlike direct diffusion methods, which often produce masks that are discontinuous and of suboptimal quality, our approach leverages an implicit SDF-based method for mask generation, ensuring the production of continuous, stable, and morphologically diverse masks. Experimental results demonstrate that our synthetic data significantly improves segmentation performance. Our work highlights the potential of diffusion model for accurately synthesizing structurally complex lesions, such as lymph nodes in rectal cancer, alleviating the challenge of limited annotated data in this field and aiding in advancements in rectal cancer diagnosis and treatment. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14977v1-abstract-full').style.display = 'none'; document.getElementById('2408.14977v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13106">arXiv:2408.13106</a> <span> [<a href="https://arxiv.org/pdf/2408.13106">pdf</a>, <a href="https://arxiv.org/format/2408.13106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech Processing Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13106v4-abstract-short" style="display: inline;"> Self-supervised learning has been proved to benefit a wide range of speech processing tasks, such as speech recognition/translation, speaker verification and diarization, etc. However, most of current approaches are computationally expensive. In this paper, we propose a simplified and more efficient self-supervised learning framework termed as NeMo Encoder for Speech Tasks (NEST). Specifically, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13106v4-abstract-full').style.display = 'inline'; document.getElementById('2408.13106v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13106v4-abstract-full" style="display: none;"> Self-supervised learning has been proved to benefit a wide range of speech processing tasks, such as speech recognition/translation, speaker verification and diarization, etc. However, most of current approaches are computationally expensive. In this paper, we propose a simplified and more efficient self-supervised learning framework termed as NeMo Encoder for Speech Tasks (NEST). 
Specifically, we adopt the FastConformer architecture with 8x sub-sampling rate, which is faster than Transformer or Conformer architectures. Instead of clustering-based quantization, we use fixed random projection for its simplicity and effectiveness. We also implement a generalized noisy speech augmentation that teaches the model to disentangle the main speaker from noise or other speakers. Experiments show that NEST improves over existing self-supervised models and achieves new state-of-the-art performance on a variety of speech processing tasks, such as speech recognition/translation, speaker diarization, spoken language understanding, etc. Code and checkpoints will be publicly available via the NVIDIA NeMo framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13106v4-abstract-full').style.display = 'none'; document.getElementById('2408.13106v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
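<p class="is-size-7">The fixed random-projection quantization that NEST adopts in place of clustering can be sketched directly: project features with a frozen random matrix and use the index of the nearest entry of a frozen random codebook as the masked-prediction target. The sizes below are assumptions.</p> <pre><code class="language-python">
import torch

class RandomProjectionQuantizer:
    """Frozen (untrained) projection and codebook; nearest-entry indices
    serve as targets for masked-frame prediction."""
    def __init__(self, feat_dim=80, code_dim=16, n_codes=8192, seed=0):
        g = torch.Generator().manual_seed(seed)
        self.proj = torch.randn(feat_dim, code_dim, generator=g)
        self.codebook = torch.randn(n_codes, code_dim, generator=g)

    def __call__(self, feats):          # feats: (T, feat_dim)
        z = feats @ self.proj           # fixed projection, no gradients
        return torch.cdist(z, self.codebook).argmin(dim=1)  # (T,) targets

# masked frames are then predicted against these ids with cross-entropy
</code></pre>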
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06870">arXiv:2408.06870</a> <span> [<a href="https://arxiv.org/pdf/2408.06870">pdf</a>, <a href="https://arxiv.org/ps/2408.06870">ps</a>, <a href="https://arxiv.org/format/2408.06870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Spectrum Prediction With Deep 3D Pyramid Vision Transformer Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Pan%2C+G">Guangliang Pan</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qihui Wu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jie Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+G">Guoru Ding</a>, <a href="/search/eess?searchtype=author&query=Yau%2C+D+K+Y">David K. Y. Yau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06870v3-abstract-short" style="display: inline;"> In this paper, we propose a deep learning (DL)-based task-driven spectrum prediction framework, named DeepSPred. The DeepSPred comprises a feature encoder and a task predictor, where the encoder extracts spectrum usage pattern features, and the predictor configures different networks according to the task requirements to predict future spectrum. Based on DeepSPred, we first propose a novel 3… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06870v3-abstract-full').style.display = 'inline'; document.getElementById('2408.06870v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06870v3-abstract-full" style="display: none;"> In this paper, we propose a deep learning (DL)-based task-driven spectrum prediction framework, named DeepSPred. The DeepSPred comprises a feature encoder and a task predictor, where the encoder extracts spectrum usage pattern features, and the predictor configures different networks according to the task requirements to predict future spectrum. Based on DeepSPred, we first propose a novel 3D spectrum prediction method combining a flow processing strategy with 3D vision Transformer (ViT, i.e., Swin) and a pyramid to serve possible applications such as the spectrum monitoring task, named 3D-SwinSTB. 3D-SwinSTB's unique 3D Patch Merging ViT-to-3D ViT Patch Expanding and pyramid designs help the model accurately learn the potential correlation of the evolution of the spectrogram over time. Then, we propose a novel spectrum occupancy rate (SOR) method by redesigning a predictor consisting exclusively of 3D convolutional and linear layers to serve possible applications such as the dynamic spectrum access (DSA) task, named 3D-SwinLinear. Unlike the 3D-SwinSTB output spectrogram, 3D-SwinLinear projects the spectrogram directly as the SOR. Finally, we employ transfer learning (TL) to ensure the applicability of our two methods to diverse spectrum services. The results show that our 3D-SwinSTB outperforms recent benchmarks by more than 5%, while our 3D-SwinLinear achieves a 90% accuracy, with a performance improvement exceeding 10%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06870v3-abstract-full').style.display = 'none'; document.getElementById('2408.06870v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03055">arXiv:2408.03055</a> <span> [<a href="https://arxiv.org/pdf/2408.03055">pdf</a>, <a href="https://arxiv.org/format/2408.03055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FDA Jamming Against Airborne Phased-MIMO Radar-Part II: Jamming STAP Performance Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yan Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen-qin Wang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Z">Zhou He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunsheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03055v1-abstract-short" style="display: inline;"> The first part of this series introduced the effectiveness of frequency diverse array (FDA) jamming through direct wave propagation in countering airborne phased multiple-input multiple-output (Phased-MIMO) radar. This part focuses on the effectiveness of FDA scattered wave (FDA-SW) jamming on the space-time adaptive processing (STAP) for airborne phased-MIMO radar.
Distinguished from the clutter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03055v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03055v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03055v1-abstract-full" style="display: none;"> The first part of this series introduced the effectiveness of frequency diverse array (FDA) jamming through direct wave propagation in countering airborne phased multiple-input multiple-output (Phased-MIMO) radar. This part focuses on the effectiveness of FDA scattered wave (FDA-SW) jamming on the space-time adaptive processing (STAP) for airborne phased-MIMO radar. Distinguished from the clutter signals, the ground equidistant scatterers of FDA-SW jamming constitute an elliptical ring, whose trajectory equations are mathematically derived to further determine the spatial frequency and Doppler frequency. For the phased-MIMO radar with different transmitting partitions, the effects of jamming frequency offset of FDA-SW on the clutter rank and STAP performance are discussed. Theoretical analysis provides the variation interval of clutter rank and the relationship between the jamming frequency offset and the improvement factor (IF) notch of phased-MIMO-STAP. Importantly, the requirements on the jamming frequency offset for the applications in both parts of this series are discussed in this part. Numerical results verify these mathematical findings and validate the effectiveness of the proposed FDA jamming in countering the phased-MIMO radar. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03055v1-abstract-full').style.display = 'none'; document.getElementById('2408.03055v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03050">arXiv:2408.03050</a> <span> [<a href="https://arxiv.org/pdf/2408.03050">pdf</a>, <a href="https://arxiv.org/format/2408.03050">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FDA Jamming Against Airborne Phased-MIMO Radar-Part I: Matched Filtering and Spatial Filtering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yan Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen-qin Wang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Z">Zhou He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunsheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03050v1-abstract-short" style="display: inline;"> Phased multiple-input multiple-output (Phased-MIMO) radar has received increasing attention for enjoying the advantages of waveform diversity and range-dependency from frequency diverse array MIMO (FDA-MIMO) radar without sacrificing coherent processing gain through partitioning transmit subarray.
This two-part series proposes a framework of electronic countermeasures (ECM) inspired by frequency d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03050v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03050v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03050v1-abstract-full" style="display: none;"> Phased multiple-input multiple-output (Phased-MIMO) radar has received increasing attention for enjoying the advantages of waveform diversity and range-dependency from frequency diverse array MIMO (FDA-MIMO) radar without sacrificing coherent processing gain through partitioning transmit subarray. This two-part series proposes a framework of electronic countermeasures (ECM) inspired by frequency diverse array (FDA) radar, called FDA jamming, evaluating its effectiveness for countering airborne phased-MIMO radar. This part introduces the principles and categories of FDA jammer and proposes the FDA jamming signal model based on the two cases of phased-MIMO radar, phased-array (PA) radar and FDA-MIMO radar. Moreover, the effects of FDA jamming on matched filtering and spatial filtering of PA and FDA-MIMO radar are analyzed. Numerical results verify the theoretical analysis and validate the effectiveness of the proposed FDA jamming in countering phased-MIMO radar. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03050v1-abstract-full').style.display = 'none'; document.getElementById('2408.03050v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03045">arXiv:2408.03045</a> <span> [<a href="https://arxiv.org/pdf/2408.03045">pdf</a>, <a href="https://arxiv.org/format/2408.03045">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Coherent FDA Radar: Transmitter and Receiver Design and Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yan Sun</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+M">Ming-jie Jia</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen-qin Wang</a>, <a href="/search/eess?searchtype=author&query=Greco%2C+M+S">Maria Sabrina Greco</a>, <a href="/search/eess?searchtype=author&query=Gini%2C+F">Fulvio Gini</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunsheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03045v1-abstract-short" style="display: inline;"> The combination of frequency diverse array (FDA) radar technology with the multiple input multiple output (MIMO) radar architecture and waveform diversity techniques potentially promises a high integration gain with respect to conventional phased array (PA) radars. 
In this paper, we propose an approach to the design of the transmitter and the receiver of a coherent FDA (C-FDA) radar that enables… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03045v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03045v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03045v1-abstract-full" style="display: none;"> The combination of frequency diverse array (FDA) radar technology with the multiple input multiple output (MIMO) radar architecture and waveform diversity techniques potentially promises a high integration gain with respect to conventional phased array (PA) radars. In this paper, we propose an approach to the design of the transmitter and the receiver of a coherent FDA (C-FDA) radar that enables it to perform the demodulation with spectral overlapping, due to the small frequency offset. To this purpose, we derive the generalized space-time-range signal model and we prove that the proposed C-FDA radar has a higher coherent array gain than a PA radar, and at the same time, it effectively resolves the secondary range-ambiguous (SRA) problem of FDA-MIMO radar, allowing for mainlobe interference suppression and range-ambiguous clutter suppression. Numerical analysis results prove the effectiveness of the proposed C-FDA radar in terms of anti-interference and anti-clutter capabilities over conventional radars. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03045v1-abstract-full').style.display = 'none'; document.getElementById('2408.03045v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01738">arXiv:2408.01738</a> <span> [<a href="https://arxiv.org/pdf/2408.01738">pdf</a>, <a href="https://arxiv.org/format/2408.01738">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Safety with Control Barrier Functions and Triggered Batch Least-Squares Identifier </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shen%2C+J">Jiajun Shen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jing Zhou</a>, <a href="/search/eess?searchtype=author&query=L%C3%BC%2C+J">Jinhu Lü</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01738v2-abstract-short" style="display: inline;"> In this paper, a triggered Batch Least-Squares Identifier (BaLSI) based adaptive safety control scheme is proposed for uncertain systems with potentially conflicting control objectives and safety constraints.
A relaxation term is added to the Quadratic Programs (QP) combining the transformed Control Lyapunov Functions (CLFs) and Control Barrier Functions (CBFs), to mediate the potential conflict.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01738v2-abstract-full').style.display = 'inline'; document.getElementById('2408.01738v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01738v2-abstract-full" style="display: none;"> In this paper, a triggered Batch Least-Squares Identifier (BaLSI) based adaptive safety control scheme is proposed for uncertain systems with potentially conflicting control objectives and safety constraints. A relaxation term is added to the Quadratic Programs (QP) combining the transformed Control Lyapunov Functions (CLFs) and Control Barrier Functions (CBFs), to mediate the potential conflict. The existing Lyapunov-based adaptive schemes, designed to guarantee specific properties of the Lyapunov functions, may grow unboundedly under the effects of the relaxation term. The adaptive law is designed by processing system inputs and outputs, to avoid unbounded estimates and overparameterization problems in the existing results. A safety-triggered condition is presented, based on which the forward invariant property of the safe set is shown and Zeno behavior can be excluded. Simulation results are presented to demonstrate the effectiveness of the proposed adaptive control scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01738v2-abstract-full').style.display = 'none'; document.getElementById('2408.01738v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 10 figures</span> </p> </li>
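<p class="is-size-7">The relaxed CLF-CBF quadratic program described above can be written pointwise: track a nominal input, penalize a relaxation variable that loosens only the CLF decrease condition, and keep the CBF (safety) condition hard. A sketch using cvxpy with illustrative gains and NumPy-array inputs; the paper's triggered BaLSI machinery is not reproduced.</p> <pre><code class="language-python">
import cvxpy as cp

def clf_cbf_qp(u_nom, V, LfV, LgV, h, Lfh, Lgh, c=1.0, alpha=1.0, p=100.0):
    """Pointwise relaxed CLF-CBF QP: delta softens tracking so that the
    hard safety row always stays enforceable. Inputs are NumPy arrays
    (u_nom, LgV, Lgh) and scalars (V, LfV, h, Lfh)."""
    u = cp.Variable(len(u_nom))
    delta = cp.Variable(nonneg=True)
    cost = cp.sum_squares(u - u_nom) + p * cp.square(delta)
    constraints = [
        delta - (LfV + LgV @ u + c * V) >= 0,  # relaxed CLF decrease
        Lfh + Lgh @ u + alpha * h >= 0,        # hard CBF condition
    ]
    cp.Problem(cp.Minimize(cost), constraints).solve()
    return u.value, delta.value
</code></pre>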
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01731">arXiv:2408.01731</a> <span> [<a href="https://arxiv.org/pdf/2408.01731">pdf</a>, <a href="https://arxiv.org/format/2408.01731">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Composite Learning Adaptive Control without Excitation Condition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shen%2C+J">Jiajun Shen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+C">Changyun Wen</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jinhu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01731v2-abstract-short" style="display: inline;"> This paper focuses on excitation collection and composite learning adaptive control design for uncertain nonlinear systems. By adopting the spectral decomposition technique, a linear regression equation is constructed to collect previously observed excitation information, establishing a relationship between unknown parameters and the system's historical data. A composite learning term, developed u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01731v2-abstract-full').style.display = 'inline'; document.getElementById('2408.01731v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01731v2-abstract-full" style="display: none;"> This paper focuses on excitation collection and composite learning adaptive control design for uncertain nonlinear systems. By adopting the spectral decomposition technique, a linear regression equation is constructed to collect previously observed excitation information, establishing a relationship between unknown parameters and the system's historical data. A composite learning term, developed using the linear regression equation, is incorporated into the Lyapunov-based parameter update law. In comparison to the existing results, all spectra of previously observed excitation information are collected, with the matrices in the linear regression equation guaranteed to be bounded. This paper introduces concepts of excited and unexcited subspaces for analyzing the parameter estimation errors, and a novel Lyapunov function is developed for stability analysis. It is demonstrated that, without imposing any excitation condition, the state and excited parameter estimation error component converge to zero, while the unexcited component remains unchanged. Simulation results are provided to validate the theoretical findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01731v2-abstract-full').style.display = 'none'; document.getElementById('2408.01731v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
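<p class="is-size-7">A generic composite learning law matching this description: accumulate a linear regression (Omega, Y) that stores past excitation, then drive the estimate with the usual tracking-error term plus the regression residual. A discrete-time sketch with assumed gains and filters; the paper's spectral-decomposition step, which keeps these matrices bounded, is not reproduced.</p> <pre><code class="language-python">
import numpy as np

def composite_step(theta, phi, y, e_track, Omega, Y, dt=1e-2, gamma=1.0, k=1.0):
    """One step of a composite learning update. phi: regressor vector;
    y: measurable (filtered) output with y = phi @ theta_true; Omega, Y
    accumulate the regression that stores previously seen excitation."""
    Omega = Omega + dt * np.outer(phi, phi)   # excitation memory
    Y = Y + dt * phi * y
    # Lyapunov tracking term + composite regression-residual term
    dtheta = gamma * (phi * e_track + k * (Y - Omega @ theta))
    return theta + dt * dtheta, Omega, Y
</code></pre>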

arXiv:2408.00365 [pdf, other] cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition), eess.IV (Image and Video Processing)
Multimodal Fusion and Coherence Modeling for Video Topic Segmentation
Authors: Hai Yu, Chong Deng, Qinglin Zhang, Jiaqing Liu, Qian Chen, Wen Wang
Abstract: The video topic segmentation (VTS) task segments videos into intelligible, non-overlapping topics, facilitating efficient comprehension of video content and quick access to specific content. VTS is also critical to various downstream video understanding tasks. Traditional VTS methods using shallow features or unsupervised approaches struggle to accurately discern the nuances of topical transitions. Recently, supervised approaches have achieved superior performance on video action or scene segmentation over unsupervised approaches. In this work, we improve supervised VTS by thoroughly exploring multimodal fusion and multimodal coherence modeling. Specifically, (1) we enhance multimodal fusion by exploring different architectures using cross-attention and mixture of experts; (2) to generally strengthen multimodality alignment and fusion, we pre-train and fine-tune the model with multimodal contrastive learning; (3) we propose a new pre-training task tailored for the VTS task, and a novel fine-tuning task for enhancing multimodal coherence modeling for VTS. We evaluate the proposed approaches on educational videos, in the form of lectures, due to the vital role of topic segmentation of educational videos in boosting learning experiences. Additionally, we introduce a large-scale Chinese lecture video dataset to augment the existing English corpus, promoting further research in VTS. Experiments on both English and Chinese lecture datasets demonstrate that our model achieves superior VTS performance compared to competitive unsupervised and supervised baselines.
Submitted 1 August, 2024; originally announced August 2024.
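
Cross-attention fusion of the kind listed in point (1) is typically one modality attending over the other. A minimal sketch in PyTorch, with hypothetical names and dimensions (illustrative only, not the paper's architecture):

import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    """Text tokens attend over visual tokens (one direction shown)."""
    def __init__(self, dim=512, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, text, visual):
        # text: (B, T_text, dim); visual: (B, T_vis, dim)
        fused, _ = self.attn(query=text, key=visual, value=visual)
        return self.norm(text + fused)   # residual connection + layer norm

A symmetric block with visual tokens as queries, or a mixture-of-experts router over several such blocks, would be the natural variants the abstract alludes to.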

arXiv:2407.21400 [pdf, other] eess.SP (Signal Processing)
Low-Coherence Sequence Design Under PAPR Constraints
Authors: Gangle Sun, Wenjin Wang, Wei Xu, Christoph Studer
Abstract: Low-coherence sequences with low peak-to-average power ratio (PAPR) are crucial for multi-carrier wireless communication systems and are used for pilots, spreading sequences, and so on. This letter proposes an efficient low-coherence sequence design algorithm (LOCEDA) that can generate any number of sequences of any length that satisfy user-defined PAPR constraints while supporting flexible subcarrier assignments in orthogonal frequency-division multiple access (OFDMA) systems. We first visualize the low-coherence sequence design problem under PAPR constraints as resolving collisions between hyperspheres. By iteratively adjusting the radii and positions of these hyperspheres, we effectively generate low-coherence sequences that strictly satisfy the imposed PAPR constraints. Simulation results (i) confirm that LOCEDA outperforms existing methods, (ii) demonstrate its flexibility, and (iii) highlight its potential for various applications.
Submitted 22 October, 2024; v1 submitted 31 July, 2024; originally announced July 2024.
Comments: To appear in IEEE WCL, and the MATLAB code is available at: https://github.com/Gangle-Sun/IEEE-WCL-LOCEDA
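
The two quantities LOCEDA trades off have standard definitions, and evaluating a candidate sequence set is straightforward. A small sketch (NumPy; these are the evaluation metrics under their usual definitions, not part of the algorithm itself):

import numpy as np

def max_coherence(S):
    """Mutual coherence of a sequence set S: (N, L) complex, rows unit-norm."""
    G = np.abs(S @ S.conj().T)   # pairwise inner-product magnitudes
    np.fill_diagonal(G, 0.0)     # ignore self-correlations
    return G.max()

def papr_db(seq, oversample=4):
    """PAPR (dB) of one frequency-domain sequence via zero-padded IFFT."""
    x = np.fft.ifft(seq, oversample * seq.size)  # oversampled time-domain signal
    p = np.abs(x) ** 2
    return 10.0 * np.log10(p.max() / p.mean())

The oversampled IFFT is the common approximation for capturing the continuous-time peak; without oversampling, PAPR is systematically underestimated.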

arXiv:2407.19503 [pdf, ps, other] eess.SP (Signal Processing), cs.IT (Information Theory)
Discrete Spectrum Analysis of Vector OFDM Signals
Authors: Xiang-Gen Xia, Wei Wang
Abstract: Vector OFDM (VOFDM) is equivalent to OTFS and is well suited to time-varying channels. However, due to its vector form, its signal spectrum is not as clear as that of conventional OFDM. In this paper, we study the discrete spectrum of discrete VOFDM signals.
We obtain a linear relationship between a vector of information symbols and a vector of the same size of components evenly distributed in the discrete VOFDM signal spectrum, and show that if a vector of information symbols is set to 0, then a corresponding vector of the same size of components of the discrete VOFDM signal spectrum is 0 as well, where the zeroed components are not adjacent but evenly distributed across the spectrum. With this linear relationship, the information symbol vectors can be locally precoded so that any components of the discrete spectrum of VOFDM signals can be set to 0, similar to conventional OFDM signals. These results are verified by simulations.
Submitted 28 July, 2024; originally announced July 2024.

arXiv:2407.18118 [pdf, other] eess.SP (Signal Processing)
Multipath Identification and Mitigation with FDA-MIMO Radar
Authors: Yizhen Jia, Jie Cheng, Wen-Qin Wang, Hui Chen
Abstract: In smart city development, the automatic detection of structures and vehicles within urban or suburban areas via array radar (airborne or vehicle platforms) becomes crucial. However, the inescapable multipath effect adversely affects the radar's capability to detect and track targets. Frequency diverse array (FDA)-MIMO radar offers innovative solutions for mitigating multipath owing to its frequency flexibility and waveform diversity among array elements.
Hence, utilizing FDA-MIMO radar, this research proposes a multipath discrimination and suppression strategy to augment target detection and suppress false alarms. The primary advancement is the transformation of conventional multipath suppression into a multipath recognition problem, enabling multipath components to be separated from single-frame echo data without prior knowledge. By offsetting the distance steering vectors of the different objects to be detected, the accurate spectral information corresponding to the current distance unit can be extracted during spatial spectrum estimation. The direct and multipath components are then differentiated according to whether the transmitting and receiving angles match. Additionally, to mitigate high-order multipath, the echo intensity of multipath components is reduced via joint optimization of the array transmit weighting and the frequency increment. Numerical results show that the proposed algorithm can identify multipath at different distances in both single-target and multi-target scenarios, outperforming a general MIMO radar.
Submitted 25 July, 2024; originally announced July 2024.
Comments: 14 pages

arXiv:2407.15245 [pdf, ps, other] math.OC (Optimization and Control), cs.LG (Machine Learning), eess.SY (Systems and Control), math-ph (Mathematical Physics), stat.ML (Machine Learning)
Weyl Calculus and Exactly Solvable Schrödinger Bridges with Quadratic State Cost
Authors: Alexis M. H. Teter, Wenqing Wang, Abhishek Halder
Abstract: The Schrödinger bridge, a stochastic dynamical generalization of optimal mass transport, exhibits a learning-control duality. Viewed as a stochastic control problem, the Schrödinger bridge finds an optimal control policy that steers a given joint state statistics to another while minimizing the total control effort subject to controlled diffusion and deadline constraints. Viewed as a stochastic learning problem, the Schrödinger bridge finds the most-likely distribution-valued trajectory connecting endpoint distributional observations, i.e., solves the two-point boundary-constrained maximum likelihood problem over the manifold of probability distributions. Recent works have shown that solving the Schrödinger bridge problem with state cost requires finding the Markov kernel associated with a reaction-diffusion PDE where the state cost appears as a state-dependent reaction rate. We explain how ideas from Weyl calculus in quantum mechanics, specifically the Weyl operator and the Weyl symbol, can help determine such Markov kernels. We illustrate these ideas by explicitly finding the Markov kernel for the case of quadratic state cost via Weyl calculus, recovering our earlier results but avoiding tedious computation with Hermite polynomials.
Submitted 12 August, 2024; v1 submitted 21 July, 2024; originally announced July 2024.
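
For reference, the Weyl operator mentioned in this abstract is the standard quantization map sending a phase-space symbol $a(x,p)$ to an operator; in the $\hbar = 1$ convention (textbook definition, stated for orientation rather than taken from the paper):

$$\big(\mathrm{Op}^{\mathrm{W}}(a)\,\psi\big)(x) \;=\; \frac{1}{(2\pi)^{n}} \int_{\mathbb{R}^{n}}\!\int_{\mathbb{R}^{n}} a\!\left(\tfrac{x+y}{2},\,p\right) e^{\,\mathrm{i}\,p\cdot(x-y)}\,\psi(y)\,\mathrm{d}y\,\mathrm{d}p,$$

where the symmetric midpoint argument $(x+y)/2$ is what distinguishes Weyl ordering from other operator orderings.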

arXiv:2407.14329 [pdf, other] cs.SD (Sound), eess.AS (Audio and Speech Processing)
Efficient Audio Captioning with Encoder-Level Knowledge Distillation
Authors: Xuenan Xu, Haohe Liu, Mengyue Wu, Wenwu Wang, Mark D. Plumbley
Abstract: Significant improvement has been achieved in automated audio captioning (AAC) with recent models. However, these models have become increasingly large as their performance is enhanced. In this work, we propose a knowledge distillation (KD) framework for AAC. Our analysis shows that in encoder-decoder based AAC models, it is more effective to distill knowledge into the encoder than into the decoder. To this end, we incorporate an encoder-level KD loss into training, in addition to the standard supervised loss and the sequence-level KD loss. We investigate two encoder-level KD methods, based on mean squared error (MSE) loss and contrastive loss, respectively. Experimental results demonstrate that contrastive KD is more robust than MSE KD, exhibiting superior performance in data-scarce situations. By leveraging audio-only data in training within the KD framework, our student model achieves competitive performance, with an inference speed that is 19 times faster (an online demo is available at https://huggingface.co/spaces/wsntxxn/efficient_audio_captioning).
Submitted 19 July, 2024; originally announced July 2024.
Comments: Interspeech 2024
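
The two encoder-level KD losses compared here are both standard constructions. A minimal sketch (PyTorch; tensor shapes and the temperature value are illustrative assumptions, not taken from the paper):

import torch
import torch.nn.functional as F

def mse_kd(student_feat, teacher_feat):
    """MSE distillation between pooled encoder features, both (B, D)."""
    return F.mse_loss(student_feat, teacher_feat.detach())  # teacher frozen

def contrastive_kd(student_feat, teacher_feat, tau=0.07):
    """InfoNCE-style distillation: each student feature should match
    its own teacher feature against the rest of the batch."""
    s = F.normalize(student_feat, dim=-1)
    t = F.normalize(teacher_feat.detach(), dim=-1)
    logits = s @ t.T / tau                                  # (B, B) similarities
    labels = torch.arange(s.size(0), device=s.device)       # diagonal = positives
    return F.cross_entropy(logits, labels)

The contrastive variant only constrains the relative geometry of the feature space rather than the exact values, which is consistent with the abstract's observation that it is the more robust of the two in data-scarce settings.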

arXiv:2407.11745 [pdf, other] eess.AS (Audio and Speech Processing), cs.AI (Artificial Intelligence), cs.SD (Sound)
Universal Sound Separation with Self-Supervised Audio Masked Autoencoder
Authors: Junqi Zhao, Xubo Liu, Jinzheng Zhao, Yi Yuan, Qiuqiang Kong, Mark D. Plumbley, Wenwu Wang
Abstract: Universal sound separation (USS) is a task of separating mixtures of arbitrary sound sources. Typically, universal separation models are trained from scratch in a supervised manner, using labeled data. Self-supervised learning (SSL) is an emerging deep learning approach that leverages unlabeled data to obtain task-agnostic representations, which can benefit many downstream tasks. In this paper, we propose integrating a self-supervised pre-trained model, namely the audio masked autoencoder (A-MAE), into a universal sound separation system to enhance its separation performance. We employ two strategies to utilize SSL embeddings: freezing or updating the parameters of A-MAE during fine-tuning. The SSL embeddings are concatenated with the short-time Fourier transform (STFT) features to serve as input features for the separation model. We evaluate our methods on the AudioSet dataset, and the experimental results indicate that the proposed methods successfully enhance the separation performance of a state-of-the-art ResUNet-based USS model.
Submitted 6 November, 2024; v1 submitted 16 July, 2024; originally announced July 2024.
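
The concatenation step is simple once the two feature streams share a time resolution. A sketch with hypothetical shapes (PyTorch; names and the interpolation choice are assumptions, not the paper's code):

import torch
import torch.nn.functional as F

def fuse_features(stft_mag, ssl_emb):
    """stft_mag: (B, T, F_bins); ssl_emb: (B, T_ssl, D) from an A-MAE-style encoder."""
    # Align the SSL time axis to the STFT frame rate before concatenating.
    ssl_aligned = F.interpolate(ssl_emb.transpose(1, 2), size=stft_mag.size(1),
                                mode="linear", align_corners=False).transpose(1, 2)
    return torch.cat([stft_mag, ssl_aligned], dim=-1)      # (B, T, F_bins + D)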

arXiv:2407.10373 [pdf, other] cs.SD (Sound), cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition), eess.AS (Audio and Speech Processing)
Mutual Learning for Acoustic Matching and Dereverberation via Visual Scene-driven Diffusion
Authors: Jian Ma, Wenguan Wang, Yi Yang, Feng Zheng
Abstract: Visual acoustic matching (VAM) is pivotal for enhancing the immersive experience, and the task of dereverberation is effective in improving audio intelligibility. Existing methods treat each task independently, overlooking the inherent reciprocity between them. Moreover, these methods depend on paired training data, which is challenging to acquire, impeding the utilization of extensive unpaired data. In this paper, we introduce MVSD, a mutual learning framework based on diffusion models. MVSD considers the two tasks symmetrically, exploiting the reciprocal relationship to facilitate learning from inverse tasks and overcome data scarcity.
Furthermore, we employ diffusion models as the foundational conditional converters to circumvent the training instability and over-smoothing drawbacks of conventional GAN architectures. Specifically, MVSD employs two converters: one for VAM, called the reverberator, and one for dereverberation, called the dereverberator. The dereverberator judges whether the reverberant audio generated by the reverberator sounds as if it were recorded in the conditioning visual scenario, and vice versa. By forming a closed loop, these two converters can generate informative feedback signals to optimize the inverse tasks, even with easily acquired one-way unpaired data. Extensive experiments on two standard benchmarks, i.e., SoundSpaces-Speech and Acoustic AVSpeech, show that our framework can improve the performance of the reverberator and dereverberator and better match specified visual scenarios.
Submitted 14 July, 2024; originally announced July 2024.
Comments: ECCV 2024; Project page: https://hechang25.github.io/MVSD

arXiv:2407.10109 [pdf] eess.SP (Signal Processing)
Hardware-Efficient and Reliable Coherent DSCM Systems Enabled by Single-Pilot-Tone-Based Polarization Demultiplexing
Authors: Wei Wang, Dongdong Zou, Weihao Ni, Fan Li
Abstract: Recently, coherent digital subcarrier multiplexing (DSCM) technology has become an attractive solution for next-generation ultra-high-speed datacenter interconnects (DCIs). To meet the low-cost and low-power-consumption requirements of DCI applications, a comprehensive simplification of the coherent DSCM system has been investigated.
The pilot-tone-based polarization demultiplexing (PT-PDM) technique, known for its low power consumption and ultra-fast polarization tracking capability, has emerged as a compelling alternative to the power-hungry N-tap adaptive multi-input multi-output (MIMO) equalizer. However, the effectiveness of the PT-PDM technique is extremely vulnerable to the receiver-side XY-skew (Rx-XY-skew), which is revealed in this paper for the first time. A pilot-tone-enabled modified Godard phase detector (PT-MGPD) scheme is then proposed to realize Rx-XY-skew estimation, serving as the prerequisite for the successful implementation of PT-PDM and the simplification of the adaptive equalizer. Both simulations and experiments are conducted to evaluate the accuracy of the proposed PT-MGPD scheme, and the results prove that it achieves accurate estimation with an error of less than 0.3 ps. In addition, a low-complexity, high-spectral-efficiency, and ultra-fast polarization demultiplexing method based on a single pilot tone (SPT) is proposed for the DSCM system in this work. Based on the proposed PT-MGPD and SPT schemes, the conventional N-tap MIMO equalizer serving each subcarrier can be pruned into two polarization-independent single-input single-output equalizers, with no performance penalty even when the polarization rotation speed reaches 10 Mrad/s. According to these results, the proposed schemes provide a hardware-efficient and reliable coherent DSCM solution for next-generation ultra-high-speed DCIs.
Submitted 14 July, 2024; originally announced July 2024.

arXiv:2407.07056 [pdf, other] cs.CV (Computer Vision and Pattern Recognition), eess.IV (Image and Video Processing)
CAPformer: Compression-Aware Pre-trained Transformer for Low-Light Image Enhancement
Authors: Wei Wang, Zhi Jin
Abstract: Low-Light Image Enhancement (LLIE) has advanced with the surge in demand for phone photography, yet most existing LLIE methods neglect compression, a crucial concern for resource-constrained phone photography, which hinders their effectiveness. In this study, we investigate the effects of JPEG compression on low-light images and reveal substantial information loss caused by JPEG due to the widespread low pixel values in dark areas. Hence, we propose the Compression-Aware Pre-trained Transformer (CAPformer), which employs a novel pre-training strategy to learn lossless information from uncompressed low-light images. Additionally, the proposed Brightness-Guided Self-Attention (BGSA) mechanism enhances rational information gathering. Experiments demonstrate the superiority of our approach in mitigating compression effects on LLIE, showcasing its potential for improving LLIE in resource-constrained scenarios.
Submitted 10 July, 2024; v1 submitted 9 July, 2024; originally announced July 2024.

arXiv:2407.05984 [pdf, other] eess.IV (Image and Video Processing)
MBA-Net: SAM-driven Bidirectional Aggregation Network for Ovarian Tumor Segmentation
Authors: Yifan Gao, Wei Xia, Wenkui Wang, Xin Gao
Abstract: Accurate segmentation of ovarian tumors from medical images is crucial for early diagnosis, treatment planning, and patient management. However, the diverse morphological characteristics and heterogeneous appearances of ovarian tumors pose significant challenges to automated segmentation methods. In this paper, we propose MBA-Net, a novel architecture that integrates the powerful segmentation capabilities of the Segment Anything Model (SAM) with domain-specific knowledge for accurate and robust ovarian tumor segmentation. MBA-Net employs a hybrid encoder architecture, where the encoder consists of a prior branch, which inherits the SAM encoder to capture robust segmentation priors, and a domain branch, specifically designed to extract domain-specific features. The bidirectional flow of information between the two branches is facilitated by the robust feature injection network (RFIN) and the domain knowledge integration network (DKIN), enabling MBA-Net to leverage the complementary strengths of both branches. We extensively evaluate MBA-Net on the public multi-modality ovarian tumor ultrasound dataset and the in-house multi-site ovarian tumor MRI dataset. Our proposed method consistently outperforms state-of-the-art segmentation approaches. Moreover, MBA-Net demonstrates superior generalization capability across different imaging modalities and clinical sites.
Submitted 8 July, 2024; originally announced July 2024.
Comments: MICCAI 2024
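
The bidirectional flow between the two encoder branches can be pictured as each branch injecting a projected version of the other's features. A toy stand-in for the RFIN/DKIN idea (PyTorch; this is an illustrative simplification, not the paper's actual modules):

import torch
import torch.nn as nn

class BidirectionalFusion(nn.Module):
    """Hypothetical sketch: cross-injection between a SAM-prior branch
    and a domain branch, each contributing a 1x1-projected residual."""
    def __init__(self, dim):
        super().__init__()
        self.to_domain = nn.Conv2d(dim, dim, kernel_size=1)  # prior -> domain
        self.to_prior = nn.Conv2d(dim, dim, kernel_size=1)   # domain -> prior

    def forward(self, prior_feat, domain_feat):
        # prior_feat, domain_feat: (B, dim, H, W) from the two encoder branches
        return (prior_feat + self.to_prior(domain_feat),
                domain_feat + self.to_domain(prior_feat))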